Libraries
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggthemes)
library(caTools)
library(corrplot)
## corrplot 0.92 loaded
library(corrgram)
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
##
## panel.fill
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart)
library(rpart.plot)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
library(class)
library(missForest)
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(performance)
library(DataExplorer)
library(lattice)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(ROSE)
## Loaded ROSE 0.0-4
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine(), randomForest::combine()
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::src() masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
library(xgboost)
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-7
library(officer)
library(vipor)
library(ggplot2)
library(ggbeeswarm)
Import data
data = read.csv("diabetic_data.csv", na.string = "?", header = T, sep = ",")
head(data)
## encounter_id patient_nbr race gender age weight
## 1 2278392 8222157 Caucasian Female [0-10) <NA>
## 2 149190 55629189 Caucasian Female [10-20) <NA>
## 3 64410 86047875 AfricanAmerican Female [20-30) <NA>
## 4 500364 82442376 Caucasian Male [30-40) <NA>
## 5 16680 42519267 Caucasian Male [40-50) <NA>
## 6 35754 82637451 Caucasian Male [50-60) <NA>
## admission_type_id discharge_disposition_id admission_source_id
## 1 6 25 1
## 2 1 1 7
## 3 1 1 7
## 4 1 1 7
## 5 1 1 7
## 6 2 1 2
## time_in_hospital payer_code medical_specialty num_lab_procedures
## 1 1 <NA> Pediatrics-Endocrinology 41
## 2 3 <NA> <NA> 59
## 3 2 <NA> <NA> 11
## 4 2 <NA> <NA> 44
## 5 1 <NA> <NA> 51
## 6 3 <NA> <NA> 31
## num_procedures num_medications number_outpatient number_emergency
## 1 0 1 0 0
## 2 0 18 0 0
## 3 5 13 2 0
## 4 1 16 0 0
## 5 0 8 0 0
## 6 6 16 0 0
## number_inpatient diag_1 diag_2 diag_3 number_diagnoses max_glu_serum
## 1 0 250.83 <NA> <NA> 1 None
## 2 0 276 250.01 255 9 None
## 3 1 648 250 V27 6 None
## 4 0 8 250.43 403 7 None
## 5 0 197 157 250 5 None
## 6 0 414 411 250 9 None
## A1Cresult metformin repaglinide nateglinide chlorpropamide glimepiride
## 1 None No No No No No
## 2 None No No No No No
## 3 None No No No No No
## 4 None No No No No No
## 5 None No No No No No
## 6 None No No No No No
## acetohexamide glipizide glyburide tolbutamide pioglitazone rosiglitazone
## 1 No No No No No No
## 2 No No No No No No
## 3 No Steady No No No No
## 4 No No No No No No
## 5 No Steady No No No No
## 6 No No No No No No
## acarbose miglitol troglitazone tolazamide examide citoglipton insulin
## 1 No No No No No No No
## 2 No No No No No No Up
## 3 No No No No No No No
## 4 No No No No No No Up
## 5 No No No No No No Steady
## 6 No No No No No No Steady
## glyburide.metformin glipizide.metformin glimepiride.pioglitazone
## 1 No No No
## 2 No No No
## 3 No No No
## 4 No No No
## 5 No No No
## 6 No No No
## metformin.rosiglitazone metformin.pioglitazone change diabetesMed readmitted
## 1 No No No No NO
## 2 No No Ch Yes >30
## 3 No No No Yes NO
## 4 No No Ch Yes NO
## 5 No No Ch Yes NO
## 6 No No No Yes >30
Data Cleaning
#Structure
str(data)
## 'data.frame': 101766 obs. of 50 variables:
## $ encounter_id : int 2278392 149190 64410 500364 16680 35754 55842 63768 12522 15738 ...
## $ patient_nbr : int 8222157 55629189 86047875 82442376 42519267 82637451 84259809 114882984 48330783 63555939 ...
## $ race : chr "Caucasian" "Caucasian" "AfricanAmerican" "Caucasian" ...
## $ gender : chr "Female" "Female" "Female" "Male" ...
## $ age : chr "[0-10)" "[10-20)" "[20-30)" "[30-40)" ...
## $ weight : chr NA NA NA NA ...
## $ admission_type_id : int 6 1 1 1 1 2 3 1 2 3 ...
## $ discharge_disposition_id: int 25 1 1 1 1 1 1 1 1 3 ...
## $ admission_source_id : int 1 7 7 7 7 2 2 7 4 4 ...
## $ time_in_hospital : int 1 3 2 2 1 3 4 5 13 12 ...
## $ payer_code : chr NA NA NA NA ...
## $ medical_specialty : chr "Pediatrics-Endocrinology" NA NA NA ...
## $ num_lab_procedures : int 41 59 11 44 51 31 70 73 68 33 ...
## $ num_procedures : int 0 0 5 1 0 6 1 0 2 3 ...
## $ num_medications : int 1 18 13 16 8 16 21 12 28 18 ...
## $ number_outpatient : int 0 0 2 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 1 0 0 0 0 0 0 0 ...
## $ diag_1 : chr "250.83" "276" "648" "8" ...
## $ diag_2 : chr NA "250.01" "250" "250.43" ...
## $ diag_3 : chr NA "255" "V27" "403" ...
## $ number_diagnoses : int 1 9 6 7 5 9 7 8 8 8 ...
## $ max_glu_serum : chr "None" "None" "None" "None" ...
## $ A1Cresult : chr "None" "None" "None" "None" ...
## $ metformin : chr "No" "No" "No" "No" ...
## $ repaglinide : chr "No" "No" "No" "No" ...
## $ nateglinide : chr "No" "No" "No" "No" ...
## $ chlorpropamide : chr "No" "No" "No" "No" ...
## $ glimepiride : chr "No" "No" "No" "No" ...
## $ acetohexamide : chr "No" "No" "No" "No" ...
## $ glipizide : chr "No" "No" "Steady" "No" ...
## $ glyburide : chr "No" "No" "No" "No" ...
## $ tolbutamide : chr "No" "No" "No" "No" ...
## $ pioglitazone : chr "No" "No" "No" "No" ...
## $ rosiglitazone : chr "No" "No" "No" "No" ...
## $ acarbose : chr "No" "No" "No" "No" ...
## $ miglitol : chr "No" "No" "No" "No" ...
## $ troglitazone : chr "No" "No" "No" "No" ...
## $ tolazamide : chr "No" "No" "No" "No" ...
## $ examide : chr "No" "No" "No" "No" ...
## $ citoglipton : chr "No" "No" "No" "No" ...
## $ insulin : chr "No" "Up" "No" "Up" ...
## $ glyburide.metformin : chr "No" "No" "No" "No" ...
## $ glipizide.metformin : chr "No" "No" "No" "No" ...
## $ glimepiride.pioglitazone: chr "No" "No" "No" "No" ...
## $ metformin.rosiglitazone : chr "No" "No" "No" "No" ...
## $ metformin.pioglitazone : chr "No" "No" "No" "No" ...
## $ change : chr "No" "Ch" "No" "Ch" ...
## $ diabetesMed : chr "No" "Yes" "Yes" "Yes" ...
## $ readmitted : chr "NO" ">30" "NO" "NO" ...
#Feature list and corresponding feature numbers
col_numbers <- cbind(colnames(data))
col_numbers
## [,1]
## [1,] "encounter_id"
## [2,] "patient_nbr"
## [3,] "race"
## [4,] "gender"
## [5,] "age"
## [6,] "weight"
## [7,] "admission_type_id"
## [8,] "discharge_disposition_id"
## [9,] "admission_source_id"
## [10,] "time_in_hospital"
## [11,] "payer_code"
## [12,] "medical_specialty"
## [13,] "num_lab_procedures"
## [14,] "num_procedures"
## [15,] "num_medications"
## [16,] "number_outpatient"
## [17,] "number_emergency"
## [18,] "number_inpatient"
## [19,] "diag_1"
## [20,] "diag_2"
## [21,] "diag_3"
## [22,] "number_diagnoses"
## [23,] "max_glu_serum"
## [24,] "A1Cresult"
## [25,] "metformin"
## [26,] "repaglinide"
## [27,] "nateglinide"
## [28,] "chlorpropamide"
## [29,] "glimepiride"
## [30,] "acetohexamide"
## [31,] "glipizide"
## [32,] "glyburide"
## [33,] "tolbutamide"
## [34,] "pioglitazone"
## [35,] "rosiglitazone"
## [36,] "acarbose"
## [37,] "miglitol"
## [38,] "troglitazone"
## [39,] "tolazamide"
## [40,] "examide"
## [41,] "citoglipton"
## [42,] "insulin"
## [43,] "glyburide.metformin"
## [44,] "glipizide.metformin"
## [45,] "glimepiride.pioglitazone"
## [46,] "metformin.rosiglitazone"
## [47,] "metformin.pioglitazone"
## [48,] "change"
## [49,] "diabetesMed"
## [50,] "readmitted"
Handling missing data
any(is.na(data))
## [1] TRUE
#plot missing data
plot_missing(data)
#Remove variables with significant missing values, non-useful variables
#removing variables:
#encounter id, pt number: identifications not required.
#weight, payor code: significant missing values
# medical specialty: unreasonable number of levels.
data <- data %>% select(-encounter_id, -patient_nbr, -weight, -payer_code, -medical_specialty)
#removing all medications
data <- data[, -(20:42)]
#check structure after removal
str(data)
## 'data.frame': 101766 obs. of 22 variables:
## $ race : chr "Caucasian" "Caucasian" "AfricanAmerican" "Caucasian" ...
## $ gender : chr "Female" "Female" "Female" "Male" ...
## $ age : chr "[0-10)" "[10-20)" "[20-30)" "[30-40)" ...
## $ admission_type_id : int 6 1 1 1 1 2 3 1 2 3 ...
## $ discharge_disposition_id: int 25 1 1 1 1 1 1 1 1 3 ...
## $ admission_source_id : int 1 7 7 7 7 2 2 7 4 4 ...
## $ time_in_hospital : int 1 3 2 2 1 3 4 5 13 12 ...
## $ num_lab_procedures : int 41 59 11 44 51 31 70 73 68 33 ...
## $ num_procedures : int 0 0 5 1 0 6 1 0 2 3 ...
## $ num_medications : int 1 18 13 16 8 16 21 12 28 18 ...
## $ number_outpatient : int 0 0 2 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 1 0 0 0 0 0 0 0 ...
## $ diag_1 : chr "250.83" "276" "648" "8" ...
## $ diag_2 : chr NA "250.01" "250" "250.43" ...
## $ diag_3 : chr NA "255" "V27" "403" ...
## $ number_diagnoses : int 1 9 6 7 5 9 7 8 8 8 ...
## $ max_glu_serum : chr "None" "None" "None" "None" ...
## $ A1Cresult : chr "None" "None" "None" "None" ...
## $ change : chr "No" "Ch" "No" "Ch" ...
## $ diabetesMed : chr "No" "Yes" "Yes" "Yes" ...
## $ readmitted : chr "NO" ">30" "NO" "NO" ...
#check for NAs in retained variables
any(is.na(data))
## [1] TRUE
#plot missing data
plot_missing(data)
#remove observations with NAs
data <- na.omit(data)
#check
plot_missing(data)
Data Conversion
#all chr variables to factor
data[] <- lapply(data, function(x) if(is.character(x)) as.factor(x) else x)
#variables admission_type_id, admission_type_id, admission_source_id to factor
data$discharge_disposition_id <- as.factor(data$discharge_disposition_id)
data$admission_type_id <- as.factor(data$admission_type_id)
data$admission_source_id <- as.factor(data$admission_source_id)
#check conversion
str(data)
## 'data.frame': 98053 obs. of 22 variables:
## $ race : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 3 3 3 3 3 3 1 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 1 1 2 2 2 2 2 1 1 1 ...
## $ age : Factor w/ 10 levels "[0-10)","[10-20)",..: 2 3 4 5 6 7 8 9 10 5 ...
## $ admission_type_id : Factor w/ 8 levels "1","2","3","4",..: 1 1 1 1 2 3 1 2 3 1 ...
## $ discharge_disposition_id: Factor w/ 26 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 3 1 ...
## $ admission_source_id : Factor w/ 17 levels "1","2","3","4",..: 7 7 7 7 2 2 7 4 4 7 ...
## $ time_in_hospital : int 3 2 2 1 3 4 5 13 12 9 ...
## $ num_lab_procedures : int 59 11 44 51 31 70 73 68 33 47 ...
## $ num_procedures : int 0 5 1 0 6 1 0 2 3 2 ...
## $ num_medications : int 18 13 16 8 16 21 12 28 18 17 ...
## $ number_outpatient : int 0 2 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 1 0 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 713 levels "10","11","110",..: 144 455 554 55 264 264 277 253 283 121 ...
## $ diag_2 : Factor w/ 740 levels "11","110","112",..: 78 77 96 24 245 245 313 259 46 240 ...
## $ diag_3 : Factor w/ 786 levels "11","110","111",..: 122 764 249 87 87 768 87 230 317 666 ...
## $ number_diagnoses : int 9 6 7 5 9 7 8 8 8 9 ...
## $ max_glu_serum : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ A1Cresult : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ change : Factor w/ 2 levels "Ch","No": 1 2 1 1 2 1 2 1 1 2 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ readmitted : Factor w/ 3 levels "<30",">30","NO": 2 3 3 3 2 3 2 3 3 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:3713] 1 20 21 22 55 66 67 88 100 112 ...
## ..- attr(*, "names")= chr [1:3713] "1" "20" "21" "22" ...
Re-factor response variable
#Function to refactor
binary = function(x){
if(x == ">30" | x == "<30"){
return("Yes")
}else{return("No")}
}
#apply fucntion to response variable
data$readmitted <- sapply(data[, c('readmitted')], function(x) binary(x))
#check reponse variable levels
table(data$readmitted)
##
## No Yes
## 52338 45715
#response variable to factor
data$readmitted <- as.factor(data$readmitted)
#check response variable type
class(data$readmitted)
## [1] "factor"
Remove invalid observations
#Remove "None" observations for A1C and max glucose serum. None = not measured
#Remove Unknown/Invalid observations for gender variable
data = subset(data, gender!= "Unknown/Invalid", )
data = subset(data, A1Cresult != "None", )
data = subset(data, max_glu_serum != "None", )
data = droplevels(data)
#Check if “None” observations removed
table(data$gender)
##
## Female Male
## 168 121
table(data$A1Cresult)
##
## >7 >8 Norm
## 63 171 55
table(data$max_glu_serum)
##
## >200 >300 Norm
## 69 124 96
Dummy Variables
#Creating dummy variables
dummies <- predict(dummyVars(~race + gender + age + admission_type_id + discharge_disposition_id + admission_source_id + max_glu_serum + A1Cresult + change + diabetesMed, data = data), newdata = data)
#check if dummies created
head(dummies)
## race.AfricanAmerican race.Asian race.Caucasian race.Hispanic race.Other
## 163 0 0 1 0 0
## 461 1 0 0 0 0
## 594 0 0 1 0 0
## 697 0 0 0 0 1
## 772 0 0 1 0 0
## 824 0 0 1 0 0
## gender.Female gender.Male age.[10-20) age.[20-30) age.[30-40) age.[40-50)
## 163 0 1 0 0 0 0
## 461 1 0 0 0 0 0
## 594 1 0 0 0 0 0
## 697 0 1 0 0 0 0
## 772 1 0 0 0 1 0
## 824 0 1 0 0 0 0
## age.[50-60) age.[60-70) age.[70-80) age.[80-90) admission_type_id.1
## 163 0 0 0 1 0
## 461 0 0 1 0 0
## 594 1 0 0 0 0
## 697 0 0 1 0 0
## 772 0 0 0 0 0
## 824 0 0 0 1 0
## admission_type_id.2 admission_type_id.3 admission_type_id.6
## 163 0 0 1
## 461 0 0 1
## 594 0 0 1
## 697 0 0 1
## 772 0 0 1
## 824 0 0 1
## discharge_disposition_id.1 discharge_disposition_id.2
## 163 0 0
## 461 1 0
## 594 1 0
## 697 0 0
## 772 1 0
## 824 1 0
## discharge_disposition_id.3 discharge_disposition_id.5
## 163 1 0
## 461 0 0
## 594 0 0
## 697 0 0
## 772 0 0
## 824 0 0
## discharge_disposition_id.6 discharge_disposition_id.7
## 163 0 0
## 461 0 0
## 594 0 0
## 697 1 0
## 772 0 0
## 824 0 0
## discharge_disposition_id.10 discharge_disposition_id.11
## 163 0 0
## 461 0 0
## 594 0 0
## 697 0 0
## 772 0 0
## 824 0 0
## discharge_disposition_id.13 admission_source_id.1 admission_source_id.2
## 163 0 0 0
## 461 0 0 0
## 594 0 0 0
## 697 0 0 0
## 772 0 0 1
## 824 0 0 0
## admission_source_id.7 max_glu_serum.>200 max_glu_serum.>300
## 163 1 1 0
## 461 1 0 1
## 594 1 0 1
## 697 1 1 0
## 772 0 0 0
## 824 1 0 1
## max_glu_serum.Norm A1Cresult.>7 A1Cresult.>8 A1Cresult.Norm change.Ch
## 163 0 0 0 1 0
## 461 0 0 1 0 1
## 594 0 0 1 0 0
## 697 0 1 0 0 0
## 772 1 1 0 0 0
## 824 0 1 0 0 0
## change.No diabetesMed.No diabetesMed.Yes
## 163 1 1 0
## 461 0 0 1
## 594 1 0 1
## 697 1 0 1
## 772 1 1 0
## 824 1 0 1
#rename columns
colnames(dummies)
## [1] "race.AfricanAmerican" "race.Asian"
## [3] "race.Caucasian" "race.Hispanic"
## [5] "race.Other" "gender.Female"
## [7] "gender.Male" "age.[10-20)"
## [9] "age.[20-30)" "age.[30-40)"
## [11] "age.[40-50)" "age.[50-60)"
## [13] "age.[60-70)" "age.[70-80)"
## [15] "age.[80-90)" "admission_type_id.1"
## [17] "admission_type_id.2" "admission_type_id.3"
## [19] "admission_type_id.6" "discharge_disposition_id.1"
## [21] "discharge_disposition_id.2" "discharge_disposition_id.3"
## [23] "discharge_disposition_id.5" "discharge_disposition_id.6"
## [25] "discharge_disposition_id.7" "discharge_disposition_id.10"
## [27] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [29] "admission_source_id.1" "admission_source_id.2"
## [31] "admission_source_id.7" "max_glu_serum.>200"
## [33] "max_glu_serum.>300" "max_glu_serum.Norm"
## [35] "A1Cresult.>7" "A1Cresult.>8"
## [37] "A1Cresult.Norm" "change.Ch"
## [39] "change.No" "diabetesMed.No"
## [41] "diabetesMed.Yes"
colnames(dummies)[colnames(dummies) == "A1Cresult.>7"] <- "A1Cresult_7"
colnames(dummies)[colnames(dummies) == "A1Cresult.>8"] <- "A1Cresult_8"
colnames(dummies)[colnames(dummies) == "A1Cresult.Norm"] <- "A1Cresult_Norm"
colnames(dummies)[colnames(dummies) == "max_glu_serum.>200"] <- "max_glu_serum_200"
colnames(dummies)[colnames(dummies) == "max_glu_serum.>300"] <- "max_glu_serum_300"
colnames(dummies)[colnames(dummies) == "max_glu_serum.Norm"] <- "max_glu_serum_Norm"
colnames(dummies)[colnames(dummies) == "change.Ch"] <- "med_change_yes"
colnames(dummies)[colnames(dummies) == "change.No"] <- "med_change_no"
colnames(dummies)[colnames(dummies) == "diabetesMed.No"] <- "diabetes_med_yes"
colnames(dummies)[colnames(dummies) == "diabetesMed.Yes"] <- "diabetes_med_no"
colnames(dummies)[colnames(dummies) == "age.[10-20)"] <- "age_10_20 "
colnames(dummies)[colnames(dummies) == "age.[20-30)"] <- "age_20_30"
colnames(dummies)[colnames(dummies) == "age.[30-40)"] <- "age_30_40"
colnames(dummies)[colnames(dummies) == "age.[40-50)"] <- "age_40_50"
colnames(dummies)[colnames(dummies) == "age.[50-60)"] <- "age_50_60"
colnames(dummies)[colnames(dummies) == "age.[60-70)"] <- "age_60_70"
colnames(dummies)[colnames(dummies) == "age.[70-80)"] <- "age_70_80"
colnames(dummies)[colnames(dummies) == "age.[80-90)"] <- "age_80_90"
colnames(dummies)[colnames(dummies) == "race.AfricanAmerican"] <- "race_AfricanAmerican"
colnames(dummies)[colnames(dummies) == "race.Asian"] <- "race_Asian"
colnames(dummies)[colnames(dummies) == "race.Caucasian"] <- "race_Caucasian"
colnames(dummies)[colnames(dummies) == "race.Hispanic"] <- "race_Hispanic"
colnames(dummies)[colnames(dummies) == "race.Other"] <- "race_Other"
#check column name chages
colnames(dummies)
## [1] "race_AfricanAmerican" "race_Asian"
## [3] "race_Caucasian" "race_Hispanic"
## [5] "race_Other" "gender.Female"
## [7] "gender.Male" "age_10_20 "
## [9] "age_20_30" "age_30_40"
## [11] "age_40_50" "age_50_60"
## [13] "age_60_70" "age_70_80"
## [15] "age_80_90" "admission_type_id.1"
## [17] "admission_type_id.2" "admission_type_id.3"
## [19] "admission_type_id.6" "discharge_disposition_id.1"
## [21] "discharge_disposition_id.2" "discharge_disposition_id.3"
## [23] "discharge_disposition_id.5" "discharge_disposition_id.6"
## [25] "discharge_disposition_id.7" "discharge_disposition_id.10"
## [27] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [29] "admission_source_id.1" "admission_source_id.2"
## [31] "admission_source_id.7" "max_glu_serum_200"
## [33] "max_glu_serum_300" "max_glu_serum_Norm"
## [35] "A1Cresult_7" "A1Cresult_8"
## [37] "A1Cresult_Norm" "med_change_yes"
## [39] "med_change_no" "diabetes_med_yes"
## [41] "diabetes_med_no"
#merge dummy variables for and original dataset
data <- cbind(data,dummies)
#check merging
str(data)
## 'data.frame': 289 obs. of 63 variables:
## $ race : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
## $ age : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
## $ admission_type_id : Factor w/ 4 levels "1","2","3","6": 4 4 4 4 4 4 4 4 4 4 ...
## $ discharge_disposition_id : Factor w/ 9 levels "1","2","3","5",..: 3 1 1 5 1 1 1 1 1 7 ...
## $ admission_source_id : Factor w/ 3 levels "1","2","7": 3 3 3 3 2 3 3 3 3 1 ...
## $ time_in_hospital : int 5 10 2 11 14 7 2 3 2 4 ...
## $ num_lab_procedures : int 47 72 61 71 43 105 66 76 43 41 ...
## $ num_procedures : int 1 1 0 1 0 3 0 0 0 1 ...
## $ num_medications : int 6 19 5 20 11 16 3 9 13 8 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 90 levels "112","162","188",..: 24 5 19 85 22 36 6 21 71 74 ...
## $ diag_2 : Factor w/ 103 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
## $ diag_3 : Factor w/ 103 levels "198","208","211",..: 46 28 9 99 65 21 19 31 92 6 ...
## $ number_diagnoses : int 5 5 5 5 3 5 3 5 5 3 ...
## $ max_glu_serum : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
## $ A1Cresult : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
## $ change : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
## $ readmitted : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 2 2 2 2 ...
## $ race_AfricanAmerican : num 0 1 0 0 0 0 0 0 0 0 ...
## $ race_Asian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ race_Caucasian : num 1 0 1 0 1 1 0 1 1 1 ...
## $ race_Hispanic : num 0 0 0 0 0 0 1 0 0 0 ...
## $ race_Other : num 0 0 0 1 0 0 0 0 0 0 ...
## $ gender.Female : num 0 1 1 0 1 0 1 1 1 0 ...
## $ gender.Male : num 1 0 0 1 0 1 0 0 0 1 ...
## $ age_10_20 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_20_30 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_30_40 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ age_40_50 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ age_50_60 : num 0 0 1 0 0 0 1 0 1 1 ...
## $ age_60_70 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_70_80 : num 0 1 0 1 0 0 0 0 0 0 ...
## $ age_80_90 : num 1 0 0 0 0 1 0 0 0 0 ...
## $ admission_type_id.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.6 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ discharge_disposition_id.1 : num 0 1 1 0 1 1 1 1 1 0 ...
## $ discharge_disposition_id.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.3 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.6 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ discharge_disposition_id.7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.10: num 0 0 0 0 0 0 0 0 0 1 ...
## $ discharge_disposition_id.11: num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.13: num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id.1 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ admission_source_id.2 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ admission_source_id.7 : num 1 1 1 1 0 1 1 1 1 0 ...
## $ max_glu_serum_200 : num 1 0 0 1 0 0 0 0 0 1 ...
## $ max_glu_serum_300 : num 0 1 1 0 0 1 0 1 1 0 ...
## $ max_glu_serum_Norm : num 0 0 0 0 1 0 1 0 0 0 ...
## $ A1Cresult_7 : num 0 0 0 1 1 1 1 1 1 0 ...
## $ A1Cresult_8 : num 0 1 1 0 0 0 0 0 0 1 ...
## $ A1Cresult_Norm : num 1 0 0 0 0 0 0 0 0 0 ...
## $ med_change_yes : num 0 1 0 0 0 0 0 1 0 0 ...
## $ med_change_no : num 1 0 1 1 1 1 1 0 1 1 ...
## $ diabetes_med_yes : num 1 0 0 0 1 0 0 0 1 0 ...
## $ diabetes_med_no : num 0 1 1 1 0 1 1 1 0 1 ...
Bucketing and dummies of diag_1, diag_2 and diag_3
#creating new object for diag1, 2 & 3 variables and initiating categories for them
diag_data <- data
diag_data$Circulatory_diag <- 0
diag_data$Respiraoty_diag <- 0
diag_data$Digestive_diag <- 0
diag_data$Diabetes_diag <- 0
diag_data$Injury_diag <- 0
diag_data$Musktl_diag <-0
diag_data$Genitourinary_diag <- 0
diag_data$Neoplasm_diag <-0
diag_data$Other_diag <- 0
#Circulatory
diag_data$Circulatory_diag[(as.character( diag_data$diag_1) >= "390" & as.character( diag_data$diag_1) <= "459" | as.character( diag_data$diag_1 ) == "785")
| (as.character( diag_data$diag_2) >= "390" & as.character( diag_data$diag_2) <= "459" | as.character( diag_data$diag_2 ) == "785")
| (as.character( diag_data$diag_3) >= "390" & as.character( diag_data$diag_3) <= "459" | as.character( diag_data$diag_3 ) == "785")] <- 1
#Diabetes
diag_data$Diabetes_diag[(as.character(diag_data$diag_1) > "249" & as.character(diag_data$diag_1) < "251")
| (as.character(diag_data$diag_2) > "249" & as.character(diag_data$diag_2) < "251")
| (as.character(diag_data$diag_3) > "249" & as.character(diag_data$diag_3) < "251")] <- 1
#Respiratory
diag_data$Respiraoty_diag[(as.character( diag_data$diag_1) >= "460" & as.character( diag_data$diag_1) <= "519" | as.character( diag_data$diag_1 ) == "786")
| (as.character( diag_data$diag_2) >= "460" & as.character( diag_data$diag_2) <= "519" | as.character( diag_data$diag_2 ) == "786")
| (as.character( diag_data$diag_3) >= "460" & as.character( diag_data$diag_3) <= "519" | as.character( diag_data$diag_3 ) == "786")] <- 1
#Digestive
diag_data$Digestive_diag[(as.character( diag_data$diag_1) >= "520" & as.character( diag_data$diag_1) <= "579" | as.character( diag_data$diag_1 ) == "787")
| (as.character( diag_data$diag_2) >= "520" & as.character( diag_data$diag_2) <= "579" | as.character( diag_data$diag_2 ) == "787")
| (as.character( diag_data$diag_3) >= "520" & as.character( diag_data$diag_3) <= "579" | as.character( diag_data$diag_3 ) == "787")] <- 1
#Injury
diag_data$Injury_diag[(as.character( diag_data$diag_1) >= "800" & as.character( diag_data$diag_1) <= "999")
| (as.character( diag_data$diag_2) >= "800" & as.character( diag_data$diag_2) <= "999")
| (as.character( diag_data$diag_3) >= "800" & as.character( diag_data$diag_3) <= "999")] <- 1
#Musculoskeletal
diag_data$Musktl_diag[(as.character( diag_data$diag_1) >= "710" & as.character( diag_data$diag_1) <= "739")
| (as.character( diag_data$diag_2) >= "710" & as.character( diag_data$diag_2) <= "739")
| (as.character( diag_data$diag_3) >= "710" & as.character( diag_data$diag_3) <= "739")] <- 1
#Genitourinary
diag_data$Genitourinary_diag[(as.character( diag_data$diag_1) >= "580" & as.character( diag_data$diag_1) <= "629" | as.character( diag_data$diag_1 ) == "788")
| (as.character( diag_data$diag_2) >= "580" & as.character( diag_data$diag_2) <= "629" | as.character( diag_data$diag_2 ) == "788")
| (as.character( diag_data$diag_3) >= "580" & as.character( diag_data$diag_3) <= "629" | as.character( diag_data$diag_3 ) == "788")] <- 1
#Neoplasms
diag_data$Neoplasm_diag[(as.character( diag_data$diag_1) >= "140" & as.character( diag_data$diag_1) <= "239")
| (as.character( diag_data$diag_2) >= "140" & as.character( diag_data$diag_2) <= "239")
| (as.character( diag_data$diag_3) >= "140" & as.character( diag_data$diag_3) <= "239")] <- 1
#Other diagnoses
diag_data$Other_diag[(as.character( diag_data$diag_1) == "780") | (as.character( diag_data$diag_1) == "781")
| (as.character( diag_data$diag_1) == "784") | (as.character( diag_data$diag_1) >= "790" & as.character( diag_data$diag_1) <= "799")
| (as.character( diag_data$diag_1) >= "240" & as.character( diag_data$diag_1) <= "249") | (as.character( diag_data$diag_1) >= "251" & as.character( diag_data$diag_1) <= "279")
| (as.character( diag_data$diag_1) >= "680" & as.character( diag_data$diag_1) <= "709") | (as.character( diag_data$diag_1) == "782")
| (as.character( diag_data$diag_1) >= "001" & as.character( diag_data$diag_1) <= "139") | (as.character( diag_data$diag_1) >= "290" & as.character( diag_data$diag_1) <= "319")
| (as.character( diag_data$diag_1) >= "280" & as.character( diag_data$diag_1) <= "289") | (as.character( diag_data$diag_1) >= "320" & as.character( diag_data$diag_1) <= "359")
| (as.character( diag_data$diag_1) >= "630" & as.character( diag_data$diag_1) <= "679") | (as.character( diag_data$diag_1) >= "360" & as.character( diag_data$diag_1) <= "389")
| (as.character( diag_data$diag_1) >= "740" & as.character( diag_data$diag_1) <= "759")
| (startsWith(as.character( diag_data$diag_1), 'E'))
| (startsWith(as.character( diag_data$diag_1), 'V'))
| (as.character( diag_data$diag_2) == "780") | (as.character( diag_data$diag_2) == "781")
| (as.character( diag_data$diag_2) == "784") | (as.character( diag_data$diag_2) >= "790" & as.character( diag_data$diag_2) <= "799")
| (as.character( diag_data$diag_2) >= "240" & as.character( diag_data$diag_2) <= "249") | (as.character( diag_data$diag_2) >= "251" & as.character( diag_data$diag_2) <= "279")
| (as.character( diag_data$diag_2) >= "680" & as.character( diag_data$diag_2) <= "709") | (as.character( diag_data$diag_2) == "782")
| (as.character( diag_data$diag_2) >= "001" & as.character( diag_data$diag_2) <= "139") | (as.character( diag_data$diag_2) >= "290" & as.character( diag_data$diag_2) <= "319")
| (as.character( diag_data$diag_2) >= "280" & as.character( diag_data$diag_2) <= "289") | (as.character( diag_data$diag_2) >= "320" & as.character( diag_data$diag_2) <= "359")
| (as.character( diag_data$diag_2) >= "630" & as.character( diag_data$diag_2) <= "679") | (as.character( diag_data$diag_2) >= "360" & as.character( diag_data$diag_2) <= "389")
| (as.character( diag_data$diag_2) >= "740" & as.character( diag_data$diag_2) <= "759")
| (startsWith(as.character( diag_data$diag_2), 'E'))
| (startsWith(as.character( diag_data$diag_2), 'V'))
| (as.character( diag_data$diag_3) == "780") | (as.character( diag_data$diag_3) == "781")
| (as.character( diag_data$diag_3) == "784") | (as.character( diag_data$diag_3) >= "790" & as.character( diag_data$diag_3) <= "799")
| (as.character( diag_data$diag_3) >= "240" & as.character( diag_data$diag_3) <= "249") | (as.character( diag_data$diag_3) >= "251" & as.character( diag_data$diag_3) <= "279")
| (as.character( diag_data$diag_3) >= "680" & as.character( diag_data$diag_3) <= "709") | (as.character( diag_data$diag_3) == "782")
| (as.character( diag_data$diag_3) >= "001" & as.character( diag_data$diag_3) <= "139") | (as.character( diag_data$diag_3) >= "290" & as.character( diag_data$diag_3) <= "319")
| (as.character( diag_data$diag_3) >= "280" & as.character( diag_data$diag_3) <= "289") | (as.character( diag_data$diag_3) >= "320" & as.character( diag_data$diag_3) <= "359")
| (as.character( diag_data$diag_3) >= "630" & as.character( diag_data$diag_3) <= "679") | (as.character( diag_data$diag_3) >= "360" & as.character( diag_data$diag_3) <= "389")
| (as.character( diag_data$diag_3) >= "740" & as.character( diag_data$diag_3) <= "759")
| (startsWith(as.character( diag_data$diag_3), 'E'))
| (startsWith(as.character( diag_data$diag_3), 'V'))] <- 1
#merge bucketed variables diag_1,2&3 with dataset
data <- diag_data
#check for bucketing and merging
str(data)
## 'data.frame': 289 obs. of 72 variables:
## $ race : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
## $ age : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
## $ admission_type_id : Factor w/ 4 levels "1","2","3","6": 4 4 4 4 4 4 4 4 4 4 ...
## $ discharge_disposition_id : Factor w/ 9 levels "1","2","3","5",..: 3 1 1 5 1 1 1 1 1 7 ...
## $ admission_source_id : Factor w/ 3 levels "1","2","7": 3 3 3 3 2 3 3 3 3 1 ...
## $ time_in_hospital : int 5 10 2 11 14 7 2 3 2 4 ...
## $ num_lab_procedures : int 47 72 61 71 43 105 66 76 43 41 ...
## $ num_procedures : int 1 1 0 1 0 3 0 0 0 1 ...
## $ num_medications : int 6 19 5 20 11 16 3 9 13 8 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 90 levels "112","162","188",..: 24 5 19 85 22 36 6 21 71 74 ...
## $ diag_2 : Factor w/ 103 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
## $ diag_3 : Factor w/ 103 levels "198","208","211",..: 46 28 9 99 65 21 19 31 92 6 ...
## $ number_diagnoses : int 5 5 5 5 3 5 3 5 5 3 ...
## $ max_glu_serum : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
## $ A1Cresult : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
## $ change : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
## $ readmitted : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 2 2 2 2 ...
## $ race_AfricanAmerican : num 0 1 0 0 0 0 0 0 0 0 ...
## $ race_Asian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ race_Caucasian : num 1 0 1 0 1 1 0 1 1 1 ...
## $ race_Hispanic : num 0 0 0 0 0 0 1 0 0 0 ...
## $ race_Other : num 0 0 0 1 0 0 0 0 0 0 ...
## $ gender.Female : num 0 1 1 0 1 0 1 1 1 0 ...
## $ gender.Male : num 1 0 0 1 0 1 0 0 0 1 ...
## $ age_10_20 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_20_30 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_30_40 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ age_40_50 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ age_50_60 : num 0 0 1 0 0 0 1 0 1 1 ...
## $ age_60_70 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_70_80 : num 0 1 0 1 0 0 0 0 0 0 ...
## $ age_80_90 : num 1 0 0 0 0 1 0 0 0 0 ...
## $ admission_type_id.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.6 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ discharge_disposition_id.1 : num 0 1 1 0 1 1 1 1 1 0 ...
## $ discharge_disposition_id.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.3 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.6 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ discharge_disposition_id.7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.10: num 0 0 0 0 0 0 0 0 0 1 ...
## $ discharge_disposition_id.11: num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.13: num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id.1 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ admission_source_id.2 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ admission_source_id.7 : num 1 1 1 1 0 1 1 1 1 0 ...
## $ max_glu_serum_200 : num 1 0 0 1 0 0 0 0 0 1 ...
## $ max_glu_serum_300 : num 0 1 1 0 0 1 0 1 1 0 ...
## $ max_glu_serum_Norm : num 0 0 0 0 1 0 1 0 0 0 ...
## $ A1Cresult_7 : num 0 0 0 1 1 1 1 1 1 0 ...
## $ A1Cresult_8 : num 0 1 1 0 0 0 0 0 0 1 ...
## $ A1Cresult_Norm : num 1 0 0 0 0 0 0 0 0 0 ...
## $ med_change_yes : num 0 1 0 0 0 0 0 1 0 0 ...
## $ med_change_no : num 1 0 1 1 1 1 1 0 1 1 ...
## $ diabetes_med_yes : num 1 0 0 0 1 0 0 0 1 0 ...
## $ diabetes_med_no : num 0 1 1 1 0 1 1 1 0 1 ...
## $ Circulatory_diag : num 1 0 0 0 0 1 1 0 0 0 ...
## $ Respiraoty_diag : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Digestive_diag : num 0 0 0 0 1 0 0 0 0 0 ...
## $ Diabetes_diag : num 0 1 1 1 1 1 1 0 1 1 ...
## $ Injury_diag : num 0 0 0 1 0 0 0 0 1 0 ...
## $ Musktl_diag : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Genitourinary_diag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Neoplasm_diag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other_diag : num 1 1 1 1 1 1 1 1 1 0 ...
cbind(colnames(data))
## [,1]
## [1,] "race"
## [2,] "gender"
## [3,] "age"
## [4,] "admission_type_id"
## [5,] "discharge_disposition_id"
## [6,] "admission_source_id"
## [7,] "time_in_hospital"
## [8,] "num_lab_procedures"
## [9,] "num_procedures"
## [10,] "num_medications"
## [11,] "number_outpatient"
## [12,] "number_emergency"
## [13,] "number_inpatient"
## [14,] "diag_1"
## [15,] "diag_2"
## [16,] "diag_3"
## [17,] "number_diagnoses"
## [18,] "max_glu_serum"
## [19,] "A1Cresult"
## [20,] "change"
## [21,] "diabetesMed"
## [22,] "readmitted"
## [23,] "race_AfricanAmerican"
## [24,] "race_Asian"
## [25,] "race_Caucasian"
## [26,] "race_Hispanic"
## [27,] "race_Other"
## [28,] "gender.Female"
## [29,] "gender.Male"
## [30,] "age_10_20 "
## [31,] "age_20_30"
## [32,] "age_30_40"
## [33,] "age_40_50"
## [34,] "age_50_60"
## [35,] "age_60_70"
## [36,] "age_70_80"
## [37,] "age_80_90"
## [38,] "admission_type_id.1"
## [39,] "admission_type_id.2"
## [40,] "admission_type_id.3"
## [41,] "admission_type_id.6"
## [42,] "discharge_disposition_id.1"
## [43,] "discharge_disposition_id.2"
## [44,] "discharge_disposition_id.3"
## [45,] "discharge_disposition_id.5"
## [46,] "discharge_disposition_id.6"
## [47,] "discharge_disposition_id.7"
## [48,] "discharge_disposition_id.10"
## [49,] "discharge_disposition_id.11"
## [50,] "discharge_disposition_id.13"
## [51,] "admission_source_id.1"
## [52,] "admission_source_id.2"
## [53,] "admission_source_id.7"
## [54,] "max_glu_serum_200"
## [55,] "max_glu_serum_300"
## [56,] "max_glu_serum_Norm"
## [57,] "A1Cresult_7"
## [58,] "A1Cresult_8"
## [59,] "A1Cresult_Norm"
## [60,] "med_change_yes"
## [61,] "med_change_no"
## [62,] "diabetes_med_yes"
## [63,] "diabetes_med_no"
## [64,] "Circulatory_diag"
## [65,] "Respiraoty_diag"
## [66,] "Digestive_diag"
## [67,] "Diabetes_diag"
## [68,] "Injury_diag"
## [69,] "Musktl_diag"
## [70,] "Genitourinary_diag"
## [71,] "Neoplasm_diag"
## [72,] "Other_diag"
#reorganize column positions: place response variable as last column.
data = data[, c(1:21, 23:72, 22)]
Variables for which dummies have been created will be removed from dataset after EDA
Splitting Data
set.seed(123)
split <- sample.split(data$readmitted, SplitRatio = 0.80)
train_set <- subset(data, split == TRUE)
test_set <- subset(data, split == FALSE)
EDA on train data set.
#summary
summary(train_set)
## race gender age admission_type_id
## AfricanAmerican: 33 Female:133 [70-80):50 1: 42
## Asian : 5 Male : 98 [50-60):47 2: 4
## Caucasian :153 [60-70):47 3: 2
## Hispanic : 31 [80-90):35 6:183
## Other : 9 [40-50):32
## [30-40):13
## (Other): 7
## discharge_disposition_id admission_source_id time_in_hospital
## 1 :149 1: 21 Min. : 1.000
## 3 : 27 2: 2 1st Qu.: 3.000
## 6 : 24 7:208 Median : 5.000
## 2 : 15 Mean : 5.459
## 7 : 7 3rd Qu.: 7.000
## 5 : 5 Max. :14.000
## (Other): 4
## num_lab_procedures num_procedures num_medications number_outpatient
## Min. : 31.00 Min. :0.0000 Min. : 1.00 Min. :0.0000
## 1st Qu.: 54.00 1st Qu.:0.0000 1st Qu.: 9.00 1st Qu.:0.0000
## Median : 63.00 Median :0.0000 Median :14.00 Median :0.0000
## Mean : 64.19 Mean :0.8528 Mean :14.68 Mean :0.1991
## 3rd Qu.: 74.00 3rd Qu.:1.5000 3rd Qu.:19.00 3rd Qu.:0.0000
## Max. :106.00 Max. :6.0000 Max. :35.00 Max. :6.0000
##
## number_emergency number_inpatient diag_1 diag_2 diag_3
## Min. :0.0000 Min. :0.0000 491 : 17 250 : 19 250 : 18
## 1st Qu.:0.0000 1st Qu.:0.0000 428 : 16 250.02 : 14 401 : 16
## Median :0.0000 Median :0.0000 682 : 13 276 : 12 276 : 12
## Mean :0.1905 Mean :0.6667 414 : 12 599 : 10 250.02 : 11
## 3rd Qu.:0.0000 3rd Qu.:1.0000 786 : 10 411 : 9 414 : 11
## Max. :9.0000 Max. :7.0000 250.02 : 8 707 : 9 428 : 7
## (Other):155 (Other):158 (Other):156
## number_diagnoses max_glu_serum A1Cresult change diabetesMed
## Min. :3.000 >200:56 >7 : 54 Ch: 64 No : 99
## 1st Qu.:5.000 >300:96 >8 :135 No:167 Yes:132
## Median :6.000 Norm:79 Norm: 42
## Mean :5.978
## 3rd Qu.:6.000
## Max. :9.000
##
## race_AfricanAmerican race_Asian race_Caucasian race_Hispanic
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :1.0000 Median :0.0000
## Mean :0.1429 Mean :0.02165 Mean :0.6623 Mean :0.1342
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## race_Other gender.Female gender.Male age_10_20
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :1.0000 Median :0.0000 Median :0.00000
## Mean :0.03896 Mean :0.5758 Mean :0.4242 Mean :0.01732
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## age_20_30 age_30_40 age_40_50 age_50_60
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.01299 Mean :0.05628 Mean :0.1385 Mean :0.2035
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## age_60_70 age_70_80 age_80_90 admission_type_id.1
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2035 Mean :0.2165 Mean :0.1515 Mean :0.1818
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## admission_type_id.2 admission_type_id.3 admission_type_id.6
## Min. :0.00000 Min. :0.000000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:1.0000
## Median :0.00000 Median :0.000000 Median :1.0000
## Mean :0.01732 Mean :0.008658 Mean :0.7922
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.000000 Max. :1.0000
##
## discharge_disposition_id.1 discharge_disposition_id.2
## Min. :0.000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.00000
## Median :1.000 Median :0.00000
## Mean :0.645 Mean :0.06494
## 3rd Qu.:1.000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.00000
##
## discharge_disposition_id.3 discharge_disposition_id.5
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000
## Mean :0.1169 Mean :0.02165
## 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
##
## discharge_disposition_id.6 discharge_disposition_id.7
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.1039 Mean :0.0303
## 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
##
## discharge_disposition_id.10 discharge_disposition_id.11
## Min. :0 Min. :0.00000
## 1st Qu.:0 1st Qu.:0.00000
## Median :0 Median :0.00000
## Mean :0 Mean :0.01299
## 3rd Qu.:0 3rd Qu.:0.00000
## Max. :0 Max. :1.00000
##
## discharge_disposition_id.13 admission_source_id.1 admission_source_id.2
## Min. :0.000000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.000000
## Median :0.000000 Median :0.00000 Median :0.000000
## Mean :0.004329 Mean :0.09091 Mean :0.008658
## 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :1.000000 Max. :1.00000 Max. :1.000000
##
## admission_source_id.7 max_glu_serum_200 max_glu_serum_300 max_glu_serum_Norm
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.000
## Mean :0.9004 Mean :0.2424 Mean :0.4156 Mean :0.342
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000
##
## A1Cresult_7 A1Cresult_8 A1Cresult_Norm med_change_yes
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :0.0000
## Mean :0.2338 Mean :0.5844 Mean :0.1818 Mean :0.2771
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## med_change_no diabetes_med_yes diabetes_med_no Circulatory_diag
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :0.7229 Mean :0.4286 Mean :0.5714 Mean :0.5411
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## Respiraoty_diag Digestive_diag Diabetes_diag Injury_diag
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :0.00000
## Mean :0.3074 Mean :0.1212 Mean :0.6277 Mean :0.07359
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## Musktl_diag Genitourinary_diag Neoplasm_diag Other_diag
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :1.0000
## Mean :0.04329 Mean :0.1602 Mean :0.03896 Mean :0.5541
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## readmitted
## No : 93
## Yes:138
##
##
##
##
##
Distribution of response variable
readmit_table <- table(train_set$readmitted)
readmit_percent <- round(readmit_table/sum(readmit_table) *100)
labels1 <- paste(names(readmit_table), readmit_percent)
labels2 <- paste(labels1, "%", sep = "")
pie(readmit_table, labels = labels2, col = c("#032149", "#eabeb4"), main = "Distribution of Readmission Variable" )
Distributions of all variables
for(each_col in 1:ncol(train_set)){
barplot(table(train_set[[each_col]]), main = names(train_set)[each_col], xlab = "", ylab = "Frequency", col = "#7289da")
}
#Relationship b/w Length of stay and readmission
readmit_time <- ggplot(train_set, aes(readmitted)) +
geom_bar(aes(time_in_hospital, fill = factor(readmitted)), position = 'stack')+
scale_fill_manual(values = c("#032149", "#eabeb4"))+
scale_color_manual(values = c("#032149", "#eabeb4" ))+
ylab("Readmission")+
xlab("Time in Hospital(days)")+
ggtitle("Time Spent in Hospital Vs Readmission")
readmit_time
#Relationship b/w if a patient is on diabetes medication and readmission
readmit_meds <- ggplot(train_set, aes(readmitted)) +
geom_bar(aes(diabetesMed, fill = factor(readmitted)), position = "dodge")+
scale_fill_manual(values = c("#032149", "#eabeb4"))+
scale_color_manual(values = c("#032149", "#eabeb4" ))+
ylab("Readmission")+
xlab("Diabetes Medication")+
ggtitle("Relationship b/w being on a diabetes medication(s) and readmission")
readmit_meds
#Frequency of readmission by age
age_readmit <- ggplot(train_set, aes(readmitted)) +
geom_bar(aes(readmitted, fill = age), position = 'dodge')+
xlab("Readmission")+
ylab("Frequnecy")+
ggtitle("Freq. of Readmission by Age")
age_readmit
#Frequency of readmission by race
race_readmit <- ggplot(train_set, aes(readmitted)) +
geom_bar(aes(readmitted, fill = race), position = 'dodge')+
xlab("Readmission")+
ylab("Frequnecy")+
ggtitle("Freq. of Readmission by Race")
race_readmit
#Frequency of readmit by gender
gender_readmit <- ggplot(train_set, aes(readmitted)) +
geom_bar(aes(readmitted, fill = gender), position = 'dodge')+
scale_fill_manual(values = c("#032149", "#eabeb4"))+
scale_color_manual(values = c("#032149", "#eabeb4"))+
xlab("Readmission")+
ylab("Frequnecy")+
ggtitle("Freq. of Readmission by Gender")
gender_readmit
#Spread of Readmission by select variables
boxplot_variables <- names(train_set %>% select(c(time_in_hospital, num_lab_procedures, num_procedures, num_medications, number_outpatient, number_inpatient,number_diagnoses)))
for (each_var in boxplot_variables) {
box_data <- data.frame(x = train_set$readmitted, y = train_set[[each_var]])
box_plot <-ggplot(box_data, aes(x= x, y = y, color = x, fill = x))+
geom_boxplot() +
geom_quasirandom(alpha = 0.3)+
xlab("Readmitted") +
ylab(each_var)+
ggtitle(paste("Spread of", each_var, "by Readmission"))+
scale_fill_manual(values = c("#032149", "#eabeb4"))+
scale_color_manual(values = c("#032149", "#eabeb4"))
print(box_plot)
}
Removing variables for which dummies have been created from train and test dataset
train_set <- train_set %>% select(-race, -gender, -age, -admission_type_id, -discharge_disposition_id, -admission_source_id, -max_glu_serum, -A1Cresult, -change, -diabetesMed, -diag_1, -diag_2, -diag_3)
test_set <- test_set %>% select(-race, -gender, -age, -admission_type_id, -discharge_disposition_id, -admission_source_id, -max_glu_serum, -A1Cresult, -change, -diabetesMed, -diag_1, -diag_2, -diag_3)
str(train_set)
## 'data.frame': 231 obs. of 59 variables:
## $ time_in_hospital : int 5 2 11 14 3 2 13 3 4 4 ...
## $ num_lab_procedures : int 47 61 71 43 76 43 58 43 52 64 ...
## $ num_procedures : int 1 0 1 0 0 0 3 0 0 0 ...
## $ num_medications : int 6 5 20 11 9 13 17 3 8 3 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_diagnoses : int 5 5 5 3 5 5 5 5 3 4 ...
## $ race_AfricanAmerican : num 0 0 0 0 0 0 0 1 0 0 ...
## $ race_Asian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ race_Caucasian : num 1 1 0 1 1 1 1 0 0 1 ...
## $ race_Hispanic : num 0 0 0 0 0 0 0 0 1 0 ...
## $ race_Other : num 0 0 1 0 0 0 0 0 0 0 ...
## $ gender.Female : num 0 1 0 1 1 1 0 1 1 0 ...
## $ gender.Male : num 1 0 1 0 0 0 1 0 0 1 ...
## $ age_10_20 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_20_30 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_30_40 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ age_40_50 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ age_50_60 : num 0 1 0 0 0 1 0 0 1 0 ...
## $ age_60_70 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_70_80 : num 0 0 1 0 0 0 1 1 0 1 ...
## $ age_80_90 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.6 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ discharge_disposition_id.1 : num 0 1 0 1 1 1 0 1 1 0 ...
## $ discharge_disposition_id.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.3 : num 1 0 0 0 0 0 1 0 0 1 ...
## $ discharge_disposition_id.5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.6 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.10: num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.11: num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.13: num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id.2 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ admission_source_id.7 : num 1 1 1 0 1 1 1 1 1 1 ...
## $ max_glu_serum_200 : num 1 0 1 0 0 0 1 1 0 0 ...
## $ max_glu_serum_300 : num 0 1 0 0 1 1 0 0 1 0 ...
## $ max_glu_serum_Norm : num 0 0 0 1 0 0 0 0 0 1 ...
## $ A1Cresult_7 : num 0 0 1 1 1 1 0 1 0 0 ...
## $ A1Cresult_8 : num 0 1 0 0 0 0 1 0 1 0 ...
## $ A1Cresult_Norm : num 1 0 0 0 0 0 0 0 0 1 ...
## $ med_change_yes : num 0 0 0 0 1 0 0 0 0 0 ...
## $ med_change_no : num 1 1 1 1 0 1 1 1 1 1 ...
## $ diabetes_med_yes : num 1 0 0 1 0 1 0 1 0 0 ...
## $ diabetes_med_no : num 0 1 1 0 1 0 1 0 1 1 ...
## $ Circulatory_diag : num 1 0 0 0 0 0 1 1 1 0 ...
## $ Respiraoty_diag : num 0 0 0 0 0 0 0 0 1 0 ...
## $ Digestive_diag : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Diabetes_diag : num 0 1 1 1 0 1 0 1 1 1 ...
## $ Injury_diag : num 0 0 1 0 0 1 0 0 0 0 ...
## $ Musktl_diag : num 0 0 0 0 0 0 1 0 0 1 ...
## $ Genitourinary_diag : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Neoplasm_diag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other_diag : num 1 1 1 1 1 1 0 0 0 1 ...
## $ readmitted : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 2 ...
str(test_set)
## 'data.frame': 58 obs. of 59 variables:
## $ time_in_hospital : int 10 7 2 4 6 5 9 5 10 5 ...
## $ num_lab_procedures : int 72 105 66 41 69 95 53 45 81 98 ...
## $ num_procedures : int 1 3 0 1 0 2 1 0 0 0 ...
## $ num_medications : int 19 16 3 8 17 11 15 11 19 15 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 1 0 1 0 0 0 ...
## $ number_diagnoses : int 5 5 3 3 5 5 5 5 5 5 ...
## $ race_AfricanAmerican : num 1 0 0 0 0 1 0 0 0 1 ...
## $ race_Asian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ race_Caucasian : num 0 1 0 1 0 0 1 1 0 0 ...
## $ race_Hispanic : num 0 0 1 0 1 0 0 0 0 0 ...
## $ race_Other : num 0 0 0 0 0 0 0 0 1 0 ...
## $ gender.Female : num 1 0 1 0 1 1 0 0 0 1 ...
## $ gender.Male : num 0 1 0 1 0 0 1 1 1 0 ...
## $ age_10_20 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_20_30 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_30_40 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_40_50 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_50_60 : num 0 0 1 1 0 1 0 0 0 1 ...
## $ age_60_70 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ age_70_80 : num 1 0 0 0 1 0 1 1 0 0 ...
## $ age_80_90 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.6 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ discharge_disposition_id.1 : num 1 1 1 0 0 1 1 0 0 1 ...
## $ discharge_disposition_id.2 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ discharge_disposition_id.3 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ discharge_disposition_id.5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.6 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ discharge_disposition_id.7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.10: num 0 0 0 1 0 0 0 0 0 0 ...
## $ discharge_disposition_id.11: num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.13: num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id.1 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ admission_source_id.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id.7 : num 1 1 1 0 1 1 1 1 1 1 ...
## $ max_glu_serum_200 : num 0 0 0 1 1 0 0 1 0 0 ...
## $ max_glu_serum_300 : num 1 1 0 0 0 0 0 0 0 1 ...
## $ max_glu_serum_Norm : num 0 0 1 0 0 1 1 0 1 0 ...
## $ A1Cresult_7 : num 0 1 1 0 0 0 0 0 0 1 ...
## $ A1Cresult_8 : num 1 0 0 1 0 1 1 0 1 0 ...
## $ A1Cresult_Norm : num 0 0 0 0 1 0 0 1 0 0 ...
## $ med_change_yes : num 1 0 0 0 0 0 1 0 0 0 ...
## $ med_change_no : num 0 1 1 1 1 1 0 1 1 1 ...
## $ diabetes_med_yes : num 0 0 0 0 1 0 0 0 0 0 ...
## $ diabetes_med_no : num 1 1 1 1 0 1 1 1 1 1 ...
## $ Circulatory_diag : num 0 1 1 0 1 1 1 1 1 0 ...
## $ Respiraoty_diag : num 0 0 0 1 0 0 0 0 1 0 ...
## $ Digestive_diag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Diabetes_diag : num 1 1 1 1 0 1 0 0 0 0 ...
## $ Injury_diag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Musktl_diag : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Genitourinary_diag : num 0 0 0 0 0 1 0 0 0 1 ...
## $ Neoplasm_diag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Other_diag : num 1 1 1 0 0 0 0 0 0 1 ...
## $ readmitted : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
Correlation
#Create train correlation data
#only numeric variables excluding dummies
train_cor_data <- train_set[, c("time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", "number_diagnoses", "number_outpatient", "number_inpatient")]
str(train_cor_data)
## 'data.frame': 231 obs. of 7 variables:
## $ time_in_hospital : int 5 2 11 14 3 2 13 3 4 4 ...
## $ num_lab_procedures: int 47 61 71 43 76 43 58 43 52 64 ...
## $ num_procedures : int 1 0 1 0 0 0 3 0 0 0 ...
## $ num_medications : int 6 5 20 11 9 13 17 3 8 3 ...
## $ number_diagnoses : int 5 5 5 3 5 5 5 5 3 4 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
#correlation matrix
train_cor <- cor(train_cor_data, use = "pairwise.complete.obs", method = "pearson")
train_cor
## time_in_hospital num_lab_procedures num_procedures
## time_in_hospital 1.000000000 0.27999268 0.23844011
## num_lab_procedures 0.279992675 1.00000000 0.21962159
## num_procedures 0.238440109 0.21962159 1.00000000
## num_medications 0.425852272 0.32974079 0.34204181
## number_diagnoses 0.129669950 0.17004561 0.20551003
## number_outpatient 0.009321861 0.09255762 0.08655823
## number_inpatient 0.073733675 -0.04523063 -0.02002753
## num_medications number_diagnoses number_outpatient
## time_in_hospital 0.4258523 0.1296700 0.009321861
## num_lab_procedures 0.3297408 0.1700456 0.092557624
## num_procedures 0.3420418 0.2055100 0.086558231
## num_medications 1.0000000 0.4868194 0.246360104
## number_diagnoses 0.4868194 1.0000000 0.480487005
## number_outpatient 0.2463601 0.4804870 1.000000000
## number_inpatient 0.2546947 0.2853367 0.272736656
## number_inpatient
## time_in_hospital 0.07373367
## num_lab_procedures -0.04523063
## num_procedures -0.02002753
## num_medications 0.25469465
## number_diagnoses 0.28533668
## number_outpatient 0.27273666
## number_inpatient 1.00000000
#correlation plot
train_cor_plot <- corrplot(train_cor, method = "number", order = "alphabet", tl.cex = 0.8, tl.srt = 45, col = c("#092c86","#666666", "#f3bbc1"))
#ggpair plot of select variables
ggpair_data <- cbind(train_cor_data, train_set$readmitted)
ggpairs(ggpair_data, aes(color = train_set$readmitted, alpha = 0.4))+ theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Handle Zero Variance variables
#Determine zero variance variables - train set
train_zvar <- colnames(train_set)[nearZeroVar(train_set)]
train_zvar
## [1] "number_emergency" "race_Asian"
## [3] "race_Other" "age_10_20 "
## [5] "age_20_30" "admission_type_id.2"
## [7] "admission_type_id.3" "discharge_disposition_id.5"
## [9] "discharge_disposition_id.7" "discharge_disposition_id.10"
## [11] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [13] "admission_source_id.2" "Musktl_diag"
## [15] "Neoplasm_diag"
#Remove zero variance variables - train set
train_variance <- train_set %>% select(-all_of(train_zvar))
str(train_variance)
## 'data.frame': 231 obs. of 44 variables:
## $ time_in_hospital : int 5 2 11 14 3 2 13 3 4 4 ...
## $ num_lab_procedures : int 47 61 71 43 76 43 58 43 52 64 ...
## $ num_procedures : int 1 0 1 0 0 0 3 0 0 0 ...
## $ num_medications : int 6 5 20 11 9 13 17 3 8 3 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_diagnoses : int 5 5 5 3 5 5 5 5 3 4 ...
## $ race_AfricanAmerican : num 0 0 0 0 0 0 0 1 0 0 ...
## $ race_Caucasian : num 1 1 0 1 1 1 1 0 0 1 ...
## $ race_Hispanic : num 0 0 0 0 0 0 0 0 1 0 ...
## $ gender.Female : num 0 1 0 1 1 1 0 1 1 0 ...
## $ gender.Male : num 1 0 1 0 0 0 1 0 0 1 ...
## $ age_30_40 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ age_40_50 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ age_50_60 : num 0 1 0 0 0 1 0 0 1 0 ...
## $ age_60_70 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_70_80 : num 0 0 1 0 0 0 1 1 0 1 ...
## $ age_80_90 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.6 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ discharge_disposition_id.1: num 0 1 0 1 1 1 0 1 1 0 ...
## $ discharge_disposition_id.2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id.3: num 1 0 0 0 0 0 1 0 0 1 ...
## $ discharge_disposition_id.6: num 0 0 1 0 0 0 0 0 0 0 ...
## $ admission_source_id.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id.7 : num 1 1 1 0 1 1 1 1 1 1 ...
## $ max_glu_serum_200 : num 1 0 1 0 0 0 1 1 0 0 ...
## $ max_glu_serum_300 : num 0 1 0 0 1 1 0 0 1 0 ...
## $ max_glu_serum_Norm : num 0 0 0 1 0 0 0 0 0 1 ...
## $ A1Cresult_7 : num 0 0 1 1 1 1 0 1 0 0 ...
## $ A1Cresult_8 : num 0 1 0 0 0 0 1 0 1 0 ...
## $ A1Cresult_Norm : num 1 0 0 0 0 0 0 0 0 1 ...
## $ med_change_yes : num 0 0 0 0 1 0 0 0 0 0 ...
## $ med_change_no : num 1 1 1 1 0 1 1 1 1 1 ...
## $ diabetes_med_yes : num 1 0 0 1 0 1 0 1 0 0 ...
## $ diabetes_med_no : num 0 1 1 0 1 0 1 0 1 1 ...
## $ Circulatory_diag : num 1 0 0 0 0 0 1 1 1 0 ...
## $ Respiraoty_diag : num 0 0 0 0 0 0 0 0 1 0 ...
## $ Digestive_diag : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Diabetes_diag : num 0 1 1 1 0 1 0 1 1 1 ...
## $ Injury_diag : num 0 0 1 0 0 1 0 0 0 0 ...
## $ Genitourinary_diag : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Other_diag : num 1 1 1 1 1 1 0 0 0 1 ...
## $ readmitted : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 2 ...
#Remove zero variance variables - test set
test_variance <- test_set %>% select(-all_of(train_zvar))
str(test_variance)
## 'data.frame': 58 obs. of 44 variables:
## $ time_in_hospital : int 10 7 2 4 6 5 9 5 10 5 ...
## $ num_lab_procedures : int 72 105 66 41 69 95 53 45 81 98 ...
## $ num_procedures : int 1 3 0 1 0 2 1 0 0 0 ...
## $ num_medications : int 19 16 3 8 17 11 15 11 19 15 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 1 0 1 0 0 0 ...
## $ number_diagnoses : int 5 5 3 3 5 5 5 5 5 5 ...
## $ race_AfricanAmerican : num 1 0 0 0 0 1 0 0 0 1 ...
## $ race_Caucasian : num 0 1 0 1 0 0 1 1 0 0 ...
## $ race_Hispanic : num 0 0 1 0 1 0 0 0 0 0 ...
## $ gender.Female : num 1 0 1 0 1 1 0 0 0 1 ...
## $ gender.Male : num 0 1 0 1 0 0 1 1 1 0 ...
## $ age_30_40 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_40_50 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age_50_60 : num 0 0 1 1 0 1 0 0 0 1 ...
## $ age_60_70 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ age_70_80 : num 1 0 0 0 1 0 1 1 0 0 ...
## $ age_80_90 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_type_id.6 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ discharge_disposition_id.1: num 1 1 1 0 0 1 1 0 0 1 ...
## $ discharge_disposition_id.2: num 0 0 0 0 0 0 0 1 0 0 ...
## $ discharge_disposition_id.3: num 0 0 0 0 0 0 0 0 1 0 ...
## $ discharge_disposition_id.6: num 0 0 0 0 1 0 0 0 0 0 ...
## $ admission_source_id.1 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ admission_source_id.7 : num 1 1 1 0 1 1 1 1 1 1 ...
## $ max_glu_serum_200 : num 0 0 0 1 1 0 0 1 0 0 ...
## $ max_glu_serum_300 : num 1 1 0 0 0 0 0 0 0 1 ...
## $ max_glu_serum_Norm : num 0 0 1 0 0 1 1 0 1 0 ...
## $ A1Cresult_7 : num 0 1 1 0 0 0 0 0 0 1 ...
## $ A1Cresult_8 : num 1 0 0 1 0 1 1 0 1 0 ...
## $ A1Cresult_Norm : num 0 0 0 0 1 0 0 1 0 0 ...
## $ med_change_yes : num 1 0 0 0 0 0 1 0 0 0 ...
## $ med_change_no : num 0 1 1 1 1 1 0 1 1 1 ...
## $ diabetes_med_yes : num 0 0 0 0 1 0 0 0 0 0 ...
## $ diabetes_med_no : num 1 1 1 1 0 1 1 1 1 1 ...
## $ Circulatory_diag : num 0 1 1 0 1 1 1 1 1 0 ...
## $ Respiraoty_diag : num 0 0 0 1 0 0 0 0 1 0 ...
## $ Digestive_diag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Diabetes_diag : num 1 1 1 1 0 1 0 0 0 0 ...
## $ Injury_diag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Genitourinary_diag : num 0 0 0 0 0 1 0 0 0 1 ...
## $ Other_diag : num 1 1 1 0 0 0 0 0 0 1 ...
## $ readmitted : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
Scaling Data
#Scale Train dataset and using Preprocessed train data to scale test_set
prep_train_set <- preProcess(train_variance %>% select(-readmitted), method = c("center", "scale"))
train_set <- predict(prep_train_set, train_variance)
test_set <- predict(prep_train_set, test_variance)
#Check variance of each column
variance <- apply(train_variance, 2, var)
variance
## time_in_hospital num_lab_procedures
## 9.29286655 203.76085074
## num_procedures num_medications
## 1.58693770 56.20282326
## number_outpatient number_inpatient
## 0.56017316 1.59710145
## number_diagnoses race_AfricanAmerican
## 2.24735554 0.12298137
## race_Caucasian race_Hispanic
## 0.22461886 0.11669490
## gender.Female gender.Male
## 0.24532279 0.24532279
## age_30_40 age_40_50
## 0.05334086 0.11985695
## age_50_60 age_60_70
## 0.16277056 0.16277056
## age_70_80 age_80_90
## 0.17033691 0.12911726
## admission_type_id.1 admission_type_id.6
## 0.14940711 0.16533032
## discharge_disposition_id.1 discharge_disposition_id.2
## 0.22996424 0.06098250
## discharge_disposition_id.3 discharge_disposition_id.6
## 0.10367024 0.09350649
## admission_source_id.1 admission_source_id.7
## 0.08300395 0.09004329
## max_glu_serum_200 max_glu_serum_300
## 0.18445323 0.24392998
## max_glu_serum_Norm A1Cresult_7
## 0.22601167 0.17989836
## A1Cresult_8 A1Cresult_Norm
## 0.24392998 0.14940711
## med_change_yes med_change_no
## 0.20116695 0.20116695
## diabetes_med_yes diabetes_med_no
## 0.24596273 0.24596273
## Circulatory_diag Respiraoty_diag
## 0.24938829 0.21381517
## Digestive_diag Diabetes_diag
## 0.10698287 0.23470732
## Injury_diag Genitourinary_diag
## 0.06847356 0.13510258
## Other_diag readmitted
## 0.24814606 NA
Dimensionality Reduction
PCA
pca_num_train <- train_set[, c("time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", "number_diagnoses", "number_outpatient", "number_inpatient")]
pca_model <- prcomp(pca_num_train,
center = T,
scale = T)
pca_model
## Standard deviations (1, .., p=7):
## [1] 1.5499985 1.1758734 0.9378776 0.8855213 0.7877915 0.7390788 0.6198477
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4 PC5
## time_in_hospital 0.3360624 -0.4173462 0.5243925 -0.0412094105 0.48866905
## num_lab_procedures 0.3124108 -0.3999764 -0.2245250 -0.6699986370 -0.47725077
## num_procedures 0.3208567 -0.3401964 -0.3342444 0.7189787806 -0.30569941
## num_medications 0.5255492 -0.1069897 0.1629984 0.0484577298 0.07532914
## number_diagnoses 0.4705238 0.3050575 -0.2289782 0.0009183564 0.21949363
## number_outpatient 0.3470551 0.4689049 -0.3748550 -0.1550765841 0.25472725
## number_inpatient 0.2616690 0.4751092 0.5859743 0.0779447376 -0.56678411
## PC6 PC7
## time_in_hospital -0.35635406 -0.26535406
## num_lab_procedures -0.06652207 -0.10447579
## num_procedures -0.22597967 -0.09026985
## num_medications 0.44066294 0.69539652
## number_diagnoses 0.48855922 -0.58842833
## number_outpatient -0.60132700 0.26198432
## number_inpatient -0.15182785 -0.10983339
#Obtain eigenvalues
get_eigenvalue(pca_model)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.4024952 34.321361 34.32136
## Dim.2 1.3826783 19.752547 54.07391
## Dim.3 0.8796144 12.565920 66.63983
## Dim.4 0.7841480 11.202114 77.84194
## Dim.5 0.6206155 8.865936 86.70788
## Dim.6 0.5462374 7.803392 94.51127
## Dim.7 0.3842112 5.488731 100.00000
#Plot eigenvalues
fviz_eig(pca_model)
#Create PCA train and test datasets
#pca data to be used for clustering and model performance evaluation. pcaComp = 5 covers ~87% of information from data.
pca = preProcess(x = train_variance %>% select(-readmitted), method = c("center", "scale", 'pca'), pcaComp = 5)
train_pca = predict(pca, train_set)
test_pca = predict(pca, test_set)
str(train_pca)
## 'data.frame': 231 obs. of 6 variables:
## $ readmitted: Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 2 ...
## $ PC1 : num -7.129 -0.761 -2.792 -5.812 -0.197 ...
## $ PC2 : num 0.187 -1.638 -1.497 3.896 -5.88 ...
## $ PC3 : num 3.86 -4.15 1.6 -4.26 -2.23 ...
## $ PC4 : num 1.66 4.11 3.63 10.45 3.41 ...
## $ PC5 : num 1.14 -3.05 -3.53 -1.4 -3.93 ...
str(test_pca)
## 'data.frame': 58 obs. of 6 variables:
## $ readmitted: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
## $ PC1 : num 1.2 -3.15 -6.9 -1.7 -5.9 ...
## $ PC2 : num -5.99 -2.91 -2.49 -2.98 1.15 ...
## $ PC3 : num -5.683 0.626 -3.637 -0.812 -0.578 ...
## $ PC4 : num 2.326 4.206 1.199 12.217 -0.728 ...
## $ PC5 : num -4.608 0.681 -3.918 -1.695 -4.187 ...
head(train_pca)
## readmitted PC1 PC2 PC3 PC4 PC5
## 163 Yes -7.1285149 0.1872657 3.861544 1.658994 1.139496
## 594 No -0.7613097 -1.6382055 -4.152343 4.114732 -3.054692
## 697 No -2.7919409 -1.4972945 1.604623 3.633316 -3.527514
## 772 Yes -5.8115645 3.8963328 -4.258017 10.450170 -1.403868
## 1281 Yes -0.1965211 -5.8801111 -2.226753 3.408606 -3.930573
## 1756 Yes -3.7727694 3.1367656 -2.632466 3.150225 -4.129599
head(test_pca)
## readmitted PC1 PC2 PC3 PC4 PC5
## 461 Yes 1.198539 -5.994778 -5.6833471 2.3261126 -4.6080128
## 824 Yes -3.149068 -2.913959 0.6261374 4.2059116 0.6811802
## 962 Yes -6.899306 -2.492718 -3.6369469 1.1987444 -3.9182227
## 1984 Yes -1.696995 -2.981183 -0.8120658 12.2168597 -1.6951108
## 2309 Yes -5.897093 1.151353 -0.5780209 -0.7283363 -4.1872207
## 5016 Yes -4.071932 -1.917166 -5.0786976 0.4321735 -2.7598897
Clustering
Hierarchical Clustering
#Compute Euclidean Distance between numerical observations in training dataset
hc_train = hclust(d = dist(train_variance %>% select(-readmitted), method = 'euclidean'), method = 'ward.D')
hc_train
##
## Call:
## hclust(d = dist(train_variance %>% select(-readmitted), method = "euclidean"), method = "ward.D")
##
## Cluster method : ward.D
## Distance : euclidean
## Number of objects: 231
#plot Dendrogram
plot(hc_train, main = "Dendrogram", xlab = "Observations", ylab = "Euclidean distances")
#categorize into three clusters (based on the dendrogram)
y_hc_train = cutree(hc_train, 3)
y_hc_train
## 163 594 697 772 1281 1756 2059 2080 2484 3277 4104
## 1 2 3 1 3 1 2 1 2 2 2
## 4857 4967 5149 5313 5388 5771 6953 8808 9119 10040 11979
## 3 2 3 2 3 2 2 3 3 2 3
## 12275 12838 13235 13365 13549 13634 14644 15583 16431 16920 17097
## 2 1 2 2 3 3 2 2 3 3 3
## 17710 17836 17839 22004 22438 23303 23465 24103 24810 25950 26277
## 2 1 1 2 2 3 2 3 2 2 2
## 26379 26453 26497 26826 26840 27472 27669 28370 28491 28772 28951
## 1 1 2 2 1 3 2 2 3 3 3
## 29058 29127 29135 29181 29256 29485 29487 29822 30885 31158 31165
## 1 1 3 2 3 2 2 3 1 2 1
## 31275 31319 31767 32134 32700 32829 32948 33020 33439 33946 35185
## 1 1 3 2 2 2 3 3 1 2 2
## 35443 35777 36745 36987 37015 37249 37402 37411 37656 37954 37987
## 2 1 1 3 1 2 2 1 3 2 3
## 38180 38566 39013 39068 39275 39356 39362 40311 40319 40338 40749
## 3 2 3 2 2 1 2 1 3 2 2
## 40816 40971 41134 41729 42248 42257 43185 43450 44040 44171 46386
## 2 2 2 1 2 2 2 2 2 3 1
## 48474 49293 50337 51185 52679 52765 53181 53261 53376 53547 53672
## 2 2 1 2 2 3 3 1 3 2 2
## 53687 54254 54342 55002 55120 55888 56068 57260 57410 57670 57806
## 1 3 3 2 2 2 1 2 1 3 1
## 57811 58769 59667 60148 63412 64163 65813 66253 66258 67102 67316
## 1 2 3 3 3 1 3 3 1 2 2
## 67406 67529 67616 67638 67716 67995 68230 68427 68516 68656 68770
## 3 2 2 3 2 2 1 1 3 1 2
## 69010 69118 69479 69621 69627 70127 70175 70357 70553 71045 71151
## 1 3 2 3 2 2 2 2 3 1 2
## 71309 71802 71940 73441 74273 74284 74831 75153 75404 75661 76113
## 1 2 3 3 1 2 1 1 2 3 2
## 76869 76954 77058 77281 77674 77817 79345 79526 79574 80203 80254
## 1 1 3 3 2 3 1 2 3 3 3
## 80411 80568 80810 81744 82474 82677 83262 83746 84340 84980 85153
## 1 1 1 2 2 2 3 3 2 1 3
## 85158 85205 85250 86386 86641 86942 87705 87731 89829 90542 90750
## 2 3 2 2 2 3 3 3 2 3 3
## 90824 91002 91881 91960 92591 93124 93323 93894 93938 94140 94519
## 3 2 1 3 2 1 3 3 3 3 1
## 94756 96459 97084 97207 97739 98208 99005 100387 100494 100579 101089
## 2 2 1 1 1 2 2 1 3 3 2
#plot clusters
clusplot(train_variance %>% select(-readmitted),
y_hc_train,
lines = 0,
shade = T,
color = T,
labels = 2,
plotchar = F,
span = T,
main = "Clusters of Patients (Hierarchical)")
K-means Clustering
k_mean = kmeans(x = train_variance %>% select(-readmitted), centers = 3)
y_means = k_mean$cluster
clusplot(train_variance %>% select(-readmitted),
y_means,
lines = 0,
shade = T,
color = T,
labels = 2,
plotchar = F,
span = T,
main = "Clusters of Patients (K-means)")
Optimum number of Clusters using Silhouette,Gap Statistic and Elbow Methods
#Silhouette method and k-means
fviz_nbclust(train_variance %>% select(-readmitted), kmeans, method = "silhouette") + labs(subtitle = "Silhoutte method")
#Silhouette method and hierarchical
fviz_nbclust(train_variance %>% select(-readmitted), hcut, method = "silhouette")+ labs(subtitle = "Silhouette method")
#Gap statistic method and kmeans
fviz_nbclust(train_variance %>% select(-readmitted), kmeans, nstart = 25, method = "gap_stat", nboot = 50)+ labs(subtitle = "Gap statistic method")
#Gap statistic method and hierarchical
fviz_nbclust(train_variance %>% select(-readmitted), hcut , nstart = 25, method = "gap_stat", nboot = 50)+ labs(subtitle = "Gap statistic method")
#Elbow method and hierarchical
fviz_nbclust(train_variance %>% select(-readmitted), hcut, method = "wss")+ labs(subbtitle = "Elbow Method")
#Elbow method and kmeans
fviz_nbclust(train_variance %>% select(-readmitted), kmeans, method = "wss")+ labs(subbtitle = "Elbow Method")
Cluster Plots based on outcome of cluster methods above*
#categorize into two clusters
y_hc_train_plot = cutree(hc_train, 2)
y_hc_train_plot
## 163 594 697 772 1281 1756 2059 2080 2484 3277 4104
## 1 1 2 1 2 1 1 1 1 1 1
## 4857 4967 5149 5313 5388 5771 6953 8808 9119 10040 11979
## 2 1 2 1 2 1 1 2 2 1 2
## 12275 12838 13235 13365 13549 13634 14644 15583 16431 16920 17097
## 1 1 1 1 2 2 1 1 2 2 2
## 17710 17836 17839 22004 22438 23303 23465 24103 24810 25950 26277
## 1 1 1 1 1 2 1 2 1 1 1
## 26379 26453 26497 26826 26840 27472 27669 28370 28491 28772 28951
## 1 1 1 1 1 2 1 1 2 2 2
## 29058 29127 29135 29181 29256 29485 29487 29822 30885 31158 31165
## 1 1 2 1 2 1 1 2 1 1 1
## 31275 31319 31767 32134 32700 32829 32948 33020 33439 33946 35185
## 1 1 2 1 1 1 2 2 1 1 1
## 35443 35777 36745 36987 37015 37249 37402 37411 37656 37954 37987
## 1 1 1 2 1 1 1 1 2 1 2
## 38180 38566 39013 39068 39275 39356 39362 40311 40319 40338 40749
## 2 1 2 1 1 1 1 1 2 1 1
## 40816 40971 41134 41729 42248 42257 43185 43450 44040 44171 46386
## 1 1 1 1 1 1 1 1 1 2 1
## 48474 49293 50337 51185 52679 52765 53181 53261 53376 53547 53672
## 1 1 1 1 1 2 2 1 2 1 1
## 53687 54254 54342 55002 55120 55888 56068 57260 57410 57670 57806
## 1 2 2 1 1 1 1 1 1 2 1
## 57811 58769 59667 60148 63412 64163 65813 66253 66258 67102 67316
## 1 1 2 2 2 1 2 2 1 1 1
## 67406 67529 67616 67638 67716 67995 68230 68427 68516 68656 68770
## 2 1 1 2 1 1 1 1 2 1 1
## 69010 69118 69479 69621 69627 70127 70175 70357 70553 71045 71151
## 1 2 1 2 1 1 1 1 2 1 1
## 71309 71802 71940 73441 74273 74284 74831 75153 75404 75661 76113
## 1 1 2 2 1 1 1 1 1 2 1
## 76869 76954 77058 77281 77674 77817 79345 79526 79574 80203 80254
## 1 1 2 2 1 2 1 1 2 2 2
## 80411 80568 80810 81744 82474 82677 83262 83746 84340 84980 85153
## 1 1 1 1 1 1 2 2 1 1 2
## 85158 85205 85250 86386 86641 86942 87705 87731 89829 90542 90750
## 1 2 1 1 1 2 2 2 1 2 2
## 90824 91002 91881 91960 92591 93124 93323 93894 93938 94140 94519
## 2 1 1 2 1 1 2 2 2 2 1
## 94756 96459 97084 97207 97739 98208 99005 100387 100494 100579 101089
## 1 1 1 1 1 1 1 1 2 2 1
#plot clusters
clusplot(train_variance %>% select(-readmitted),
y_hc_train_plot,
lines = 0,
shade = T,
color = T,
labels = 2,
plotchar = F,
span = T,
main = "Clusters of Patients(Hierarchical)-Based on outcomes of cluster methods")
K-means Clustering
k_mean_plot = kmeans(x = train_variance %>% select(-readmitted), centers = 2)
y_means_plot = k_mean_plot$cluster
clusplot(train_variance %>% select(-readmitted),
y_means,
lines = 0,
shade = T,
color = T,
labels = 2,
plotchar = F,
span = T,
main = "Clusters of Patients(K-means):Based on outcomes of cluster plot methods")
Hierarchical Clustering using PCA data
#Compute Euclidean Distance between numerical observations in training PCA dataset
hc_train_pca = hclust(d = dist(train_pca %>% select(-readmitted), method = 'euclidean'), method = 'ward.D')
hc_train_pca
##
## Call:
## hclust(d = dist(train_pca %>% select(-readmitted), method = "euclidean"), method = "ward.D")
##
## Cluster method : ward.D
## Distance : euclidean
## Number of objects: 231
#plot Dendrogram - PCA
plot(hc_train_pca, main = "Dendrogram", xlab = "Observations", ylab = "Euclidean distances")
#categorize into two clusters (based on the dendrogram) - PCA
y_hc_train_pca = cutree(hc_train, 3)
y_hc_train_pca
## 163 594 697 772 1281 1756 2059 2080 2484 3277 4104
## 1 2 3 1 3 1 2 1 2 2 2
## 4857 4967 5149 5313 5388 5771 6953 8808 9119 10040 11979
## 3 2 3 2 3 2 2 3 3 2 3
## 12275 12838 13235 13365 13549 13634 14644 15583 16431 16920 17097
## 2 1 2 2 3 3 2 2 3 3 3
## 17710 17836 17839 22004 22438 23303 23465 24103 24810 25950 26277
## 2 1 1 2 2 3 2 3 2 2 2
## 26379 26453 26497 26826 26840 27472 27669 28370 28491 28772 28951
## 1 1 2 2 1 3 2 2 3 3 3
## 29058 29127 29135 29181 29256 29485 29487 29822 30885 31158 31165
## 1 1 3 2 3 2 2 3 1 2 1
## 31275 31319 31767 32134 32700 32829 32948 33020 33439 33946 35185
## 1 1 3 2 2 2 3 3 1 2 2
## 35443 35777 36745 36987 37015 37249 37402 37411 37656 37954 37987
## 2 1 1 3 1 2 2 1 3 2 3
## 38180 38566 39013 39068 39275 39356 39362 40311 40319 40338 40749
## 3 2 3 2 2 1 2 1 3 2 2
## 40816 40971 41134 41729 42248 42257 43185 43450 44040 44171 46386
## 2 2 2 1 2 2 2 2 2 3 1
## 48474 49293 50337 51185 52679 52765 53181 53261 53376 53547 53672
## 2 2 1 2 2 3 3 1 3 2 2
## 53687 54254 54342 55002 55120 55888 56068 57260 57410 57670 57806
## 1 3 3 2 2 2 1 2 1 3 1
## 57811 58769 59667 60148 63412 64163 65813 66253 66258 67102 67316
## 1 2 3 3 3 1 3 3 1 2 2
## 67406 67529 67616 67638 67716 67995 68230 68427 68516 68656 68770
## 3 2 2 3 2 2 1 1 3 1 2
## 69010 69118 69479 69621 69627 70127 70175 70357 70553 71045 71151
## 1 3 2 3 2 2 2 2 3 1 2
## 71309 71802 71940 73441 74273 74284 74831 75153 75404 75661 76113
## 1 2 3 3 1 2 1 1 2 3 2
## 76869 76954 77058 77281 77674 77817 79345 79526 79574 80203 80254
## 1 1 3 3 2 3 1 2 3 3 3
## 80411 80568 80810 81744 82474 82677 83262 83746 84340 84980 85153
## 1 1 1 2 2 2 3 3 2 1 3
## 85158 85205 85250 86386 86641 86942 87705 87731 89829 90542 90750
## 2 3 2 2 2 3 3 3 2 3 3
## 90824 91002 91881 91960 92591 93124 93323 93894 93938 94140 94519
## 3 2 1 3 2 1 3 3 3 3 1
## 94756 96459 97084 97207 97739 98208 99005 100387 100494 100579 101089
## 2 2 1 1 1 2 2 1 3 3 2
#plot clusters - PCA
clusplot(train_pca %>% select(-readmitted),
y_hc_train_pca,
lines = 0,
shade = T,
color = T,
labels = 2,
plotchar = F,
span = T,
main = "Clusters of Patients (Hierarchical)- PCA data")
K-means Clustering - PCA
k_mean_pca = kmeans(x = train_pca %>% select(-readmitted), centers = 2)
y_means_pca = k_mean_pca$cluster
clusplot(train_pca %>% select(-readmitted),
y_means_pca,
lines = 0,
shade = T,
color = T,
labels = 2,
plotchar = F,
span = T,
main = "Clusters of Patients (K-means) - PCA data")
Optimum number of Clusters using Silhouette,Gap Statistic and Elbow Methods for PCA data
#Silhouette method and k-means - PCA
fviz_nbclust(train_pca %>% select(-readmitted), kmeans, method = "silhouette") + labs(subtitle = "Silhoutte method - PCA data")
#Silhouette method and hierarchical - PCA
fviz_nbclust(train_pca %>% select(-readmitted), hcut, method = "silhouette")+ labs(subtitle = "Silhouette method - PCA data")
#Gap statistic method and kmeans - PCA
fviz_nbclust(train_pca %>% select(-readmitted), kmeans, nstart = 25, method = "gap_stat", nboot = 50)+ labs(subtitle = "Gap statistic method - PCA data")
#Gap statistic method and hierarchical - PCA
fviz_nbclust(train_pca %>% select(-readmitted), hcut , nstart = 25, method = "gap_stat", nboot = 50)+ labs(subtitle = "Gap statistic method - PCA data")
#Elbow method and hierarchical - PCA
fviz_nbclust(train_pca %>% select(-readmitted), hcut, method = "wss")+ labs(subbtitle = "Elbow Method - PCA data")
#Elbow method and kmeans - PCA
fviz_nbclust(train_pca %>% select(-readmitted), kmeans, method = "wss")+ labs(subbtitle = "Elbow Method - PCA data")
Plot Clusters based on outcome of above methods
#categorize into 4 clusters - PCA
y_hc_train_plot_pca = cutree(hc_train, 4)
y_hc_train_plot_pca
## 163 594 697 772 1281 1756 2059 2080 2484 3277 4104
## 1 2 3 1 3 1 2 1 2 2 2
## 4857 4967 5149 5313 5388 5771 6953 8808 9119 10040 11979
## 3 2 4 2 3 2 2 3 3 2 3
## 12275 12838 13235 13365 13549 13634 14644 15583 16431 16920 17097
## 2 1 2 2 3 3 2 2 4 3 3
## 17710 17836 17839 22004 22438 23303 23465 24103 24810 25950 26277
## 2 1 1 2 2 3 2 3 2 2 2
## 26379 26453 26497 26826 26840 27472 27669 28370 28491 28772 28951
## 1 1 2 2 1 3 2 2 3 3 4
## 29058 29127 29135 29181 29256 29485 29487 29822 30885 31158 31165
## 1 1 3 2 3 2 2 3 1 2 1
## 31275 31319 31767 32134 32700 32829 32948 33020 33439 33946 35185
## 1 1 3 2 2 2 3 3 1 2 2
## 35443 35777 36745 36987 37015 37249 37402 37411 37656 37954 37987
## 2 1 1 3 1 2 2 1 3 2 4
## 38180 38566 39013 39068 39275 39356 39362 40311 40319 40338 40749
## 3 2 3 2 2 1 2 1 3 2 2
## 40816 40971 41134 41729 42248 42257 43185 43450 44040 44171 46386
## 2 2 2 1 2 2 2 2 2 3 1
## 48474 49293 50337 51185 52679 52765 53181 53261 53376 53547 53672
## 2 2 1 2 2 3 3 1 3 2 2
## 53687 54254 54342 55002 55120 55888 56068 57260 57410 57670 57806
## 1 3 4 2 2 2 1 2 1 3 1
## 57811 58769 59667 60148 63412 64163 65813 66253 66258 67102 67316
## 1 2 3 4 3 1 3 3 1 2 2
## 67406 67529 67616 67638 67716 67995 68230 68427 68516 68656 68770
## 3 2 2 3 2 2 1 1 3 1 2
## 69010 69118 69479 69621 69627 70127 70175 70357 70553 71045 71151
## 1 3 2 3 2 2 2 2 4 1 2
## 71309 71802 71940 73441 74273 74284 74831 75153 75404 75661 76113
## 1 2 3 3 1 2 1 1 2 3 2
## 76869 76954 77058 77281 77674 77817 79345 79526 79574 80203 80254
## 1 1 4 3 2 3 1 2 4 4 3
## 80411 80568 80810 81744 82474 82677 83262 83746 84340 84980 85153
## 1 1 1 2 2 2 3 3 2 1 3
## 85158 85205 85250 86386 86641 86942 87705 87731 89829 90542 90750
## 2 4 2 2 2 4 3 3 2 3 3
## 90824 91002 91881 91960 92591 93124 93323 93894 93938 94140 94519
## 3 2 1 4 2 1 3 3 3 4 1
## 94756 96459 97084 97207 97739 98208 99005 100387 100494 100579 101089
## 2 2 1 1 1 2 2 1 3 4 2
#plot clusters - PCA
clusplot(train_pca %>% select(-readmitted),
y_hc_train_plot_pca,
lines = 0,
shade = T,
color = T,
labels = 2,
plotchar = F,
span = T,
main = "Clusters of Patients(Hierarchical):Based on methods' outcome w/ PCA data")
K-means Clustering - PCA
k_mean_plot_pca = kmeans(x = train_pca %>% select(-readmitted), centers = 4)
y_means_plot_pca = k_mean_plot_pca$cluster
clusplot(train_pca %>% select(-readmitted),
y_means_plot_pca,
lines = 0,
shade = T,
color = T,
labels = 2,
plotchar = F,
span = T,
main = "Clusters of Patients (K-means):Based on methods' outcome w/ PCA data")
Supervised Models
#Initiate model outcomes comparison lists
Model_Names <- c()
Model_AUC_Values <- c()
Logistic Regression
#LR model1: all variables
lr_model1 <- glm(formula = readmitted~., family = binomial, data = train_set)
summary(lr_model1)
##
## Call:
## glm(formula = readmitted ~ ., family = binomial, data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2578 -0.8508 0.4198 0.7898 1.9126
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.671189 8.143002 0.082 0.9343
## time_in_hospital 0.505113 0.229413 2.202 0.0277 *
## num_lab_procedures -0.613587 0.203776 -3.011 0.0026 **
## num_procedures -0.165364 0.194378 -0.851 0.3949
## num_medications 0.437743 0.254940 1.717 0.0860 .
## number_outpatient 0.158439 0.248601 0.637 0.5239
## number_inpatient 0.426799 0.253119 1.686 0.0918 .
## number_diagnoses -0.055302 0.286420 -0.193 0.8469
## race_AfricanAmerican -0.557919 0.293540 -1.901 0.0573 .
## race_Caucasian -0.210426 0.342733 -0.614 0.5392
## race_Hispanic -0.052851 0.283040 -0.187 0.8519
## gender.Female 0.179833 0.185335 0.970 0.3319
## gender.Male NA NA NA NA
## age_30_40 -0.609509 0.286917 -2.124 0.0336 *
## age_40_50 -0.555756 0.381370 -1.457 0.1450
## age_50_60 -0.784620 0.451991 -1.736 0.0826 .
## age_60_70 -0.793687 0.461006 -1.722 0.0851 .
## age_70_80 -0.822054 0.456394 -1.801 0.0717 .
## age_80_90 -0.791266 0.421328 -1.878 0.0604 .
## admission_type_id.1 0.838138 0.568296 1.475 0.1403
## admission_type_id.6 0.512948 0.618424 0.829 0.4069
## discharge_disposition_id.1 0.204641 0.311731 0.656 0.5115
## discharge_disposition_id.2 0.365556 0.230544 1.586 0.1128
## discharge_disposition_id.3 0.368761 0.266565 1.383 0.1665
## discharge_disposition_id.6 -0.125522 0.248839 -0.504 0.6140
## admission_source_id.1 -4.158026 270.909636 -0.015 0.9878
## admission_source_id.7 -4.719514 282.163384 -0.017 0.9867
## max_glu_serum_200 0.042715 0.195501 0.218 0.8270
## max_glu_serum_300 0.062625 0.258818 0.242 0.8088
## max_glu_serum_Norm NA NA NA NA
## A1Cresult_7 -0.016264 0.231401 -0.070 0.9440
## A1Cresult_8 -0.001258 0.263973 -0.005 0.9962
## A1Cresult_Norm NA NA NA NA
## med_change_yes 0.065503 0.229605 0.285 0.7754
## med_change_no NA NA NA NA
## diabetes_med_yes -0.040807 0.217926 -0.187 0.8515
## diabetes_med_no NA NA NA NA
## Circulatory_diag -0.101579 0.225510 -0.450 0.6524
## Respiraoty_diag 0.057752 0.206678 0.279 0.7799
## Digestive_diag 0.017185 0.193284 0.089 0.9292
## Diabetes_diag -0.444971 0.203907 -2.182 0.0291 *
## Injury_diag 0.027498 0.193633 0.142 0.8871
## Genitourinary_diag -0.096998 0.177963 -0.545 0.5857
## Other_diag -0.449687 0.228882 -1.965 0.0494 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 235.20 on 192 degrees of freedom
## AIC: 313.2
##
## Number of Fisher Scoring iterations: 14
#LR model2 - with all significant variables from lr model1
lr_model2 <- glm(formula = readmitted~ time_in_hospital + num_lab_procedures + num_medications + number_outpatient + number_inpatient + age_30_40+ age_50_60 +age_60_70 + age_70_80+ age_80_90 + Diabetes_diag + Other_diag, family = binomial, data = train_set)
summary(lr_model2)
##
## Call:
## glm(formula = readmitted ~ time_in_hospital + num_lab_procedures +
## num_medications + number_outpatient + number_inpatient +
## age_30_40 + age_50_60 + age_60_70 + age_70_80 + age_80_90 +
## Diabetes_diag + Other_diag, family = binomial, data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4497 -1.0196 0.5353 0.8958 1.7010
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5189 0.1563 3.320 0.00090 ***
## time_in_hospital 0.4573 0.1852 2.469 0.01355 *
## num_lab_procedures -0.5495 0.1713 -3.209 0.00133 **
## num_medications 0.3573 0.2030 1.760 0.07841 .
## number_outpatient 0.1888 0.2039 0.926 0.35456
## number_inpatient 0.5245 0.2317 2.264 0.02360 *
## age_30_40 -0.2917 0.1715 -1.701 0.08892 .
## age_50_60 -0.2529 0.2086 -1.213 0.22526
## age_60_70 -0.2871 0.2205 -1.302 0.19304
## age_70_80 -0.2396 0.2109 -1.136 0.25596
## age_80_90 -0.2614 0.2034 -1.285 0.19882
## Diabetes_diag -0.2969 0.1615 -1.838 0.06599 .
## Other_diag -0.4166 0.1652 -2.521 0.01169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 262.76 on 218 degrees of freedom
## AIC: 288.76
##
## Number of Fisher Scoring iterations: 5
#LR model3
lr_model3 <- glm(formula = readmitted~ time_in_hospital + num_lab_procedures + number_inpatient + Other_diag, family = binomial, data = train_set)
summary(lr_model3)
##
## Call:
## glm(formula = readmitted ~ time_in_hospital + num_lab_procedures +
## number_inpatient + Other_diag, family = binomial, data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3390 -1.0886 0.5733 0.9820 1.7639
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5104 0.1527 3.342 0.000831 ***
## time_in_hospital 0.4978 0.1639 3.037 0.002391 **
## num_lab_procedures -0.3767 0.1533 -2.458 0.013987 *
## number_inpatient 0.6997 0.2202 3.177 0.001486 **
## Other_diag -0.4160 0.1477 -2.817 0.004852 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 275.51 on 226 degrees of freedom
## AIC: 285.51
##
## Number of Fisher Scoring iterations: 4
#LR model 4: only most significant variable (**)
lr_model4 <- glm(formula = readmitted~ time_in_hospital + number_inpatient + Other_diag, family = binomial, data = train_set)
summary(lr_model4)
##
## Call:
## glm(formula = readmitted ~ time_in_hospital + number_inpatient +
## Other_diag, family = binomial, data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1614 -1.1161 0.6079 1.0182 1.5116
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4934 0.1498 3.294 0.000988 ***
## time_in_hospital 0.3734 0.1501 2.488 0.012836 *
## number_inpatient 0.6944 0.2139 3.247 0.001166 **
## Other_diag -0.3789 0.1446 -2.620 0.008801 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 281.78 on 227 degrees of freedom
## AIC: 289.78
##
## Number of Fisher Scoring iterations: 4
#performance comparison
compare_performance(lr_model1, lr_model2, lr_model3, lr_model4)
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | Score_spherical | PCP
## -----------------------------------------------------------------------------------------------------------------------------------------------
## lr_model1 | glm | 313.2 (<.001) | 329.5 (<.001) | 447.5 (<.001) | 0.293 | 0.414 | 1.107 | 0.509 | -Inf | 0.004 | 0.660
## lr_model2 | glm | 288.8 (0.149) | 290.4 (0.079) | 333.5 (<.001) | 0.192 | 0.441 | 1.098 | 0.569 | -151.099 | 0.004 | 0.611
## lr_model3 | glm | 285.5 (0.761) | 285.8 (0.819) | 302.7 (0.602) | 0.145 | 0.453 | 1.104 | 0.596 | -145.517 | 0.004 | 0.589
## lr_model4 | glm | 289.8 (0.090) | 290.0 (0.101) | 303.5 (0.398) | 0.118 | 0.460 | 1.114 | 0.610 | -141.505 | 0.005 | 0.576
Logistic regression predictions on test data using LR model3 of train data (model with lowest AIC value)
lr_result = predict(lr_model3, test_set, type = "response")
lr_result
## 461 824 962 1984 2309 5016 6129 6452
## 0.5754908 0.2579972 0.3007636 0.7266454 0.7538063 0.4294611 0.8840238 0.7379612
## 7698 9471 11728 15359 15538 17795 18511 20963
## 0.7113244 0.2317860 0.9085133 0.1734862 0.6725922 0.6765962 0.6281069 0.3213621
## 23943 25851 29060 29366 30649 31935 32130 34606
## 0.2377085 0.6452716 0.4060798 0.7415831 0.6642826 0.6535344 0.9284307 0.5916334
## 35975 36349 36685 39784 39833 40684 44259 44286
## 0.5690315 0.8791495 0.6486647 0.3281264 0.9131998 0.4768299 0.7098697 0.7586708
## 46721 51517 57291 57846 58435 60051 65125 66118
## 0.6509185 0.5451385 0.6674764 0.6764589 0.7699843 0.4025018 0.9542761 0.3095083
## 66395 68026 68269 68348 70681 71862 72980 74933
## 0.6192066 0.4253061 0.4333837 0.5862032 0.6788687 0.3624103 0.9924719 0.4775360
## 76453 76775 76814 77043 78993 79192 82660 84909
## 0.6506927 0.8744509 0.6827756 0.4423166 0.4793012 0.2737639 0.6213054 0.5606325
## 88479 101030
## 0.4240962 0.4537406
#Thresholding to obtain class for confusion matrix
result_class0.5 = ifelse(lr_result >=0.5, 1, 0)
result_class0.5
## 461 824 962 1984 2309 5016 6129 6452 7698 9471 11728
## 1 0 0 1 1 0 1 1 1 0 1
## 15359 15538 17795 18511 20963 23943 25851 29060 29366 30649 31935
## 0 1 1 1 0 0 1 0 1 1 1
## 32130 34606 35975 36349 36685 39784 39833 40684 44259 44286 46721
## 1 1 1 1 1 0 1 0 1 1 1
## 51517 57291 57846 58435 60051 65125 66118 66395 68026 68269 68348
## 1 1 1 1 0 1 0 1 0 0 1
## 70681 71862 72980 74933 76453 76775 76814 77043 78993 79192 82660
## 1 0 1 0 1 1 1 0 0 0 1
## 84909 88479 101030
## 1 0 0
table(test_set$readmitted, result_class0.5)
## result_class0.5
## 0 1
## No 13 10
## Yes 8 27
#Accuracy (threshold 0.5)
accuracy0.5 <- (16+37)/(16+37+19+15)
accuracy0.5
## [1] 0.6091954
#Sensitivty(threshold = 0.5)
#sensitivity = TP/(TP + FN)
sensitivity0.5 = 16/(16+19)
sensitivity0.5
## [1] 0.4571429
#Specificity(threshold = 0.5)
#specificity = TN/(TN+FP)
specificity0.5 = 37/(37+15)
specificity0.5
## [1] 0.7115385
#LR ROC
lr_roc <- roc(test_set$readmitted, lr_result, plot = TRUE, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#LR AUC
lr_auc <- auc(test_set$readmitted, lr_result)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
lr_auc
## Area under the curve: 0.7379
#Append comparison lists
Model_Names <- append(Model_Names, "Logistic Regression")
Model_AUC_Values <- append(Model_AUC_Values, lr_auc)
Elastic Net
#GLMNet
glmnet_model = cv.glmnet(as.matrix(train_set %>% select(-readmitted)), train_set$readmitted, family = "binomial", alpha = 0.5)
plot(glmnet_model)
glmnet_model
##
## Call: cv.glmnet(x = as.matrix(train_set %>% select(-readmitted)), y = train_set$readmitted, family = "binomial", alpha = 0.5)
##
## Measure: Binomial Deviance
##
## Lambda Index Measure SE Nonzero
## min 0.04374 19 1.280 0.03130 21
## 1se 0.08389 12 1.309 0.02736 14
glmnet_model$lambda.min
## [1] 0.04374214
glmnet_model$lambda.1se
## [1] 0.0838935
#Retrieve coefficients
coef(glmnet_model, s = glmnet_model$lambda.min)
## 44 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.448369055
## time_in_hospital 0.224994675
## num_lab_procedures -0.236882822
## num_procedures .
## num_medications 0.203465214
## number_outpatient 0.016879812
## number_inpatient 0.316770733
## number_diagnoses .
## race_AfricanAmerican -0.197707893
## race_Caucasian .
## race_Hispanic .
## gender.Female 0.045637428
## gender.Male -0.042845674
## age_30_40 -0.056903353
## age_40_50 .
## age_50_60 .
## age_60_70 .
## age_70_80 .
## age_80_90 .
## admission_type_id.1 0.174575358
## admission_type_id.6 .
## discharge_disposition_id.1 .
## discharge_disposition_id.2 0.088496691
## discharge_disposition_id.3 0.099408984
## discharge_disposition_id.6 -0.097540298
## admission_source_id.1 .
## admission_source_id.7 -0.119507446
## max_glu_serum_200 .
## max_glu_serum_300 .
## max_glu_serum_Norm .
## A1Cresult_7 .
## A1Cresult_8 .
## A1Cresult_Norm .
## med_change_yes 0.008450632
## med_change_no -0.007006306
## diabetes_med_yes -0.005011454
## diabetes_med_no 0.003269135
## Circulatory_diag .
## Respiraoty_diag 0.071180199
## Digestive_diag .
## Diabetes_diag -0.114325117
## Injury_diag .
## Genitourinary_diag .
## Other_diag -0.207451761
coef(glmnet_model, s = glmnet_model$lambda.1se)
## 44 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.414940997
## time_in_hospital 0.110886976
## num_lab_procedures -0.096337939
## num_procedures .
## num_medications 0.164124330
## number_outpatient .
## number_inpatient 0.240549917
## number_diagnoses .
## race_AfricanAmerican -0.101037482
## race_Caucasian .
## race_Hispanic .
## gender.Female 0.009251185
## gender.Male -0.008286248
## age_30_40 .
## age_40_50 .
## age_50_60 .
## age_60_70 .
## age_70_80 .
## age_80_90 .
## admission_type_id.1 0.053206738
## admission_type_id.6 .
## discharge_disposition_id.1 .
## discharge_disposition_id.2 0.016630006
## discharge_disposition_id.3 0.039877769
## discharge_disposition_id.6 .
## admission_source_id.1 .
## admission_source_id.7 -0.015855431
## max_glu_serum_200 .
## max_glu_serum_300 .
## max_glu_serum_Norm .
## A1Cresult_7 .
## A1Cresult_8 .
## A1Cresult_Norm .
## med_change_yes .
## med_change_no .
## diabetes_med_yes .
## diabetes_med_no .
## Circulatory_diag .
## Respiraoty_diag 0.039137301
## Digestive_diag .
## Diabetes_diag -0.029490578
## Injury_diag .
## Genitourinary_diag .
## Other_diag -0.135774004
#Prediction on test dataset
glmnet_pred = predict(glmnet_model, newx=as.matrix(test_set%>% select (-readmitted)), type = "response", s=glmnet_model$lambda.1se)
glmnet_pred
## s1
## 461 0.5109715
## 824 0.4747913
## 962 0.4332065
## 1984 0.6122516
## 2309 0.6652422
## 5016 0.4512088
## 6129 0.6952248
## 6452 0.6277509
## 7698 0.6852019
## 9471 0.4157722
## 11728 0.6905312
## 15359 0.4366775
## 15538 0.6330190
## 17795 0.6091232
## 18511 0.5444884
## 20963 0.4865441
## 23943 0.4292829
## 25851 0.5053022
## 29060 0.5526081
## 29366 0.6091707
## 30649 0.6204999
## 31935 0.5721938
## 32130 0.7641792
## 34606 0.5414921
## 35975 0.5638125
## 36349 0.7194968
## 36685 0.6613101
## 39784 0.4035714
## 39833 0.7136276
## 40684 0.5514611
## 44259 0.5835540
## 44286 0.6873014
## 46721 0.6030548
## 51517 0.5285979
## 57291 0.6813965
## 57846 0.5419784
## 58435 0.6736669
## 60051 0.4745504
## 65125 0.7618593
## 66118 0.4725333
## 66395 0.6840611
## 68026 0.5640196
## 68269 0.4773355
## 68348 0.4706460
## 70681 0.7503135
## 71862 0.4339513
## 72980 0.8711432
## 74933 0.4133803
## 76453 0.4942775
## 76775 0.6524288
## 76814 0.6449857
## 77043 0.4979473
## 78993 0.6682822
## 79192 0.4783805
## 82660 0.6862439
## 84909 0.6337811
## 88479 0.5547753
## 101030 0.5645004
glmnet_class = ifelse(glmnet_pred > 0.5, 1, 0)
table(glmnet_class)
## glmnet_class
## 0 1
## 17 41
glmnet_roc <- roc(test_set$readmitted, as.numeric(glmnet_pred), auc = T, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#glmtnet AUC
glmnet_auc <- auc(glmnet_roc)
glmnet_auc
## Area under the curve: 0.7093
#Append comparison lists
Model_Names <- append(Model_Names, "GLMNet")
Model_AUC_Values <- append(Model_AUC_Values, glmnet_auc)
Decision Tree
#Model
d_tree = rpart(formula = readmitted ~., method = 'class', data = train_set)
#Prediction on test data
y_pred_tree = predict(d_tree, newdata = test_set)
y_pred_tree
## No Yes
## 461 0.8571429 0.1428571
## 824 0.6363636 0.3636364
## 962 0.7878788 0.2121212
## 1984 0.1428571 0.8571429
## 2309 0.1612903 0.8387097
## 5016 0.8571429 0.1428571
## 6129 0.1612903 0.8387097
## 6452 0.2745098 0.7254902
## 7698 0.2800000 0.7200000
## 9471 0.8571429 0.1428571
## 11728 0.3846154 0.6153846
## 15359 0.7878788 0.2121212
## 15538 0.2800000 0.7200000
## 17795 0.1612903 0.8387097
## 18511 0.1428571 0.8571429
## 20963 0.7878788 0.2121212
## 23943 0.7878788 0.2121212
## 25851 0.1428571 0.8571429
## 29060 1.0000000 0.0000000
## 29366 0.1612903 0.8387097
## 30649 0.2745098 0.7254902
## 31935 0.2745098 0.7254902
## 32130 0.1612903 0.8387097
## 34606 0.2745098 0.7254902
## 35975 0.2800000 0.7200000
## 36349 0.1612903 0.8387097
## 36685 0.2745098 0.7254902
## 39784 0.3846154 0.6153846
## 39833 0.1612903 0.8387097
## 40684 0.2745098 0.7254902
## 44259 0.2745098 0.7254902
## 44286 0.1612903 0.8387097
## 46721 0.2800000 0.7200000
## 51517 0.2745098 0.7254902
## 57291 0.1612903 0.8387097
## 57846 0.1612903 0.8387097
## 58435 0.1612903 0.8387097
## 60051 0.3846154 0.6153846
## 65125 0.1612903 0.8387097
## 66118 0.8571429 0.1428571
## 66395 0.2800000 0.7200000
## 68026 0.2745098 0.7254902
## 68269 0.3846154 0.6153846
## 68348 0.3846154 0.6153846
## 70681 0.1612903 0.8387097
## 71862 0.8571429 0.1428571
## 72980 0.1612903 0.8387097
## 74933 0.7878788 0.2121212
## 76453 0.1428571 0.8571429
## 76775 0.2745098 0.7254902
## 76814 0.2745098 0.7254902
## 77043 0.8750000 0.1250000
## 78993 0.2800000 0.7200000
## 79192 0.4285714 0.5714286
## 82660 0.1612903 0.8387097
## 84909 0.2800000 0.7200000
## 88479 0.2745098 0.7254902
## 101030 0.2800000 0.7200000
#Confusion matrix(treshold = 0.5)
y_pred_class0.5 = ifelse(y_pred_tree[, 'Yes'] >= 0.5, "Yes", "No" )
y_pred_class0.5
## 461 824 962 1984 2309 5016 6129 6452 7698 9471 11728
## "No" "No" "No" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "No" "Yes"
## 15359 15538 17795 18511 20963 23943 25851 29060 29366 30649 31935
## "No" "Yes" "Yes" "Yes" "No" "No" "Yes" "No" "Yes" "Yes" "Yes"
## 32130 34606 35975 36349 36685 39784 39833 40684 44259 44286 46721
## "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## 51517 57291 57846 58435 60051 65125 66118 66395 68026 68269 68348
## "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes"
## 70681 71862 72980 74933 76453 76775 76814 77043 78993 79192 82660
## "Yes" "No" "Yes" "No" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes"
## 84909 88479 101030
## "Yes" "Yes" "Yes"
cm_0.5 <- table(test_set$readmitted, y_pred_class0.5)
cm_0.5
## y_pred_class0.5
## No Yes
## No 7 16
## Yes 6 29
y_pred_class_fac0.5 = factor(y_pred_class0.5)
y_pred_class_fac0.5
## 461 824 962 1984 2309 5016 6129 6452 7698 9471 11728
## No No No Yes Yes No Yes Yes Yes No Yes
## 15359 15538 17795 18511 20963 23943 25851 29060 29366 30649 31935
## No Yes Yes Yes No No Yes No Yes Yes Yes
## 32130 34606 35975 36349 36685 39784 39833 40684 44259 44286 46721
## Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes
## 51517 57291 57846 58435 60051 65125 66118 66395 68026 68269 68348
## Yes Yes Yes Yes Yes Yes No Yes Yes Yes Yes
## 70681 71862 72980 74933 76453 76775 76814 77043 78993 79192 82660
## Yes No Yes No Yes Yes Yes No Yes Yes Yes
## 84909 88479 101030
## Yes Yes Yes
## Levels: No Yes
confusionMatrix(test_set$readmitted, y_pred_class_fac0.5)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 7 16
## Yes 6 29
##
## Accuracy : 0.6207
## 95% CI : (0.4837, 0.7449)
## No Information Rate : 0.7759
## P-Value [Acc > NIR] : 0.99764
##
## Kappa : 0.1436
##
## Mcnemar's Test P-Value : 0.05501
##
## Sensitivity : 0.5385
## Specificity : 0.6444
## Pos Pred Value : 0.3043
## Neg Pred Value : 0.8286
## Prevalence : 0.2241
## Detection Rate : 0.1207
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.5915
##
## 'Positive' Class : No
##
#Confusion matrix (threshold = 0.2)
y_pred_class0.2 = ifelse(y_pred_tree[, "Yes"]>=0.2, "Yes", "No")
y_pred_class0.2
## 461 824 962 1984 2309 5016 6129 6452 7698 9471 11728
## "No" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "No" "Yes"
## 15359 15538 17795 18511 20963 23943 25851 29060 29366 30649 31935
## "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes"
## 32130 34606 35975 36349 36685 39784 39833 40684 44259 44286 46721
## "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## 51517 57291 57846 58435 60051 65125 66118 66395 68026 68269 68348
## "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes"
## 70681 71862 72980 74933 76453 76775 76814 77043 78993 79192 82660
## "Yes" "No" "Yes" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes"
## 84909 88479 101030
## "Yes" "Yes" "Yes"
cm_0.2 <- table(test_set$readmitted, y_pred_class0.2)
cm_0.2
## y_pred_class0.2
## No Yes
## No 5 18
## Yes 2 33
y_pred_class_fac0.2 = factor(y_pred_class0.2)
table(y_pred_class_fac0.2)
## y_pred_class_fac0.2
## No Yes
## 7 51
confusionMatrix(test_set$readmitted, y_pred_class_fac0.2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 5 18
## Yes 2 33
##
## Accuracy : 0.6552
## 95% CI : (0.5188, 0.7751)
## No Information Rate : 0.8793
## P-Value [Acc > NIR] : 0.9999981
##
## Kappa : 0.1819
##
## Mcnemar's Test P-Value : 0.0007962
##
## Sensitivity : 0.71429
## Specificity : 0.64706
## Pos Pred Value : 0.21739
## Neg Pred Value : 0.94286
## Prevalence : 0.12069
## Detection Rate : 0.08621
## Detection Prevalence : 0.39655
## Balanced Accuracy : 0.68067
##
## 'Positive' Class : No
##
#DT ROC
dt_roc = roc(test_set$readmitted, y_pred_tree[, 'Yes'])
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
dt_roc_plot <- plot(dt_roc, print.auc = TRUE)
#DT AUC
dt_auc <- auc(dt_roc_plot)
dt_auc
## Area under the curve: 0.6261
#Append comparison lists
Model_Names <- append(Model_Names, "Decision Tree")
Model_AUC_Values <- append(Model_AUC_Values, dt_auc)
#Plotting
plot(d_tree, uniform = TRUE, main = "Readmission Prediction")
text(d_tree, use.n = TRUE, all = TRUE)
prp(d_tree, type = 1, extra = 3, under = TRUE, varlen = 0, tweak = 1, main = "Readmission Prediction")
Random Forest
#RF model
rf_model <- randomForest(formula = readmitted ~., method = 'class', data = train_set)
rf_model
##
## Call:
## randomForest(formula = readmitted ~ ., data = train_set, method = "class")
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 39.83%
## Confusion matrix:
## No Yes class.error
## No 32 61 0.6559140
## Yes 31 107 0.2246377
#Class for confusion matrix
y_pred_rf = predict(rf_model, newdata = test_set %>% select(-readmitted))
#Probability for ROC And AUC
y_prob_rf = predict(rf_model, newdata = test_set, type = 'prob')
y_prob_rf
## No Yes
## 461 0.322 0.678
## 824 0.548 0.452
## 962 0.688 0.312
## 1984 0.436 0.564
## 2309 0.348 0.652
## 5016 0.674 0.326
## 6129 0.268 0.732
## 6452 0.320 0.680
## 7698 0.252 0.748
## 9471 0.656 0.344
## 11728 0.460 0.540
## 15359 0.550 0.450
## 15538 0.258 0.742
## 17795 0.414 0.586
## 18511 0.544 0.456
## 20963 0.634 0.366
## 23943 0.692 0.308
## 25851 0.630 0.370
## 29060 0.538 0.462
## 29366 0.398 0.602
## 30649 0.452 0.548
## 31935 0.466 0.534
## 32130 0.216 0.784
## 34606 0.514 0.486
## 35975 0.356 0.644
## 36349 0.166 0.834
## 36685 0.230 0.770
## 39784 0.630 0.370
## 39833 0.202 0.798
## 40684 0.490 0.510
## 44259 0.476 0.524
## 44286 0.210 0.790
## 46721 0.276 0.724
## 51517 0.604 0.396
## 57291 0.462 0.538
## 57846 0.396 0.604
## 58435 0.326 0.674
## 60051 0.678 0.322
## 65125 0.162 0.838
## 66118 0.552 0.448
## 66395 0.362 0.638
## 68026 0.366 0.634
## 68269 0.782 0.218
## 68348 0.612 0.388
## 70681 0.200 0.800
## 71862 0.550 0.450
## 72980 0.178 0.822
## 74933 0.870 0.130
## 76453 0.566 0.434
## 76775 0.326 0.674
## 76814 0.270 0.730
## 77043 0.532 0.468
## 78993 0.154 0.846
## 79192 0.462 0.538
## 82660 0.382 0.618
## 84909 0.324 0.676
## 88479 0.450 0.550
## 101030 0.340 0.660
## attr(,"class")
## [1] "matrix" "array" "votes"
#RF ROC
rf_roc <- roc(test_set$readmitted, y_prob_rf[, "Yes"], plot = TRUE, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#RF AUC
rf_auc <- auc(rf_roc)
rf_auc
## Area under the curve: 0.6938
#Feature Importance
importance(rf_model)
## MeanDecreaseGini
## time_in_hospital 8.342252
## num_lab_procedures 11.867214
## num_procedures 3.712192
## num_medications 13.245162
## number_outpatient 1.414403
## number_inpatient 5.973916
## number_diagnoses 3.779270
## race_AfricanAmerican 2.244809
## race_Caucasian 2.153812
## race_Hispanic 1.109640
## gender.Female 2.355329
## gender.Male 2.454000
## age_30_40 1.232779
## age_40_50 1.105829
## age_50_60 1.650378
## age_60_70 1.575285
## age_70_80 1.744514
## age_80_90 1.446605
## admission_type_id.1 1.115223
## admission_type_id.6 1.072231
## discharge_disposition_id.1 2.139967
## discharge_disposition_id.2 1.134374
## discharge_disposition_id.3 1.690191
## discharge_disposition_id.6 1.585194
## admission_source_id.1 1.063924
## admission_source_id.7 1.388172
## max_glu_serum_200 1.871660
## max_glu_serum_300 1.640048
## max_glu_serum_Norm 1.669858
## A1Cresult_7 1.472351
## A1Cresult_8 1.940613
## A1Cresult_Norm 1.551391
## med_change_yes 1.404087
## med_change_no 1.550146
## diabetes_med_yes 1.854193
## diabetes_med_no 1.824120
## Circulatory_diag 1.954859
## Respiraoty_diag 2.425489
## Digestive_diag 1.064355
## Diabetes_diag 2.316744
## Injury_diag 1.020229
## Genitourinary_diag 1.376744
## Other_diag 3.002588
SVM
#SVM model(kernel = linear)
classifier_linear = svm(formula = readmitted~., data = train_variance, kernel = "linear")
classifier_linear
##
## Call:
## svm(formula = readmitted ~ ., data = train_variance, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 156
#Predictions on test data
y_pred_svml = predict(classifier_linear, newdata = test_variance)
#Confusion matrix
cm_svml = table(test_variance$readmitted, y_pred_svml)
cm_svml
## y_pred_svml
## No Yes
## No 15 8
## Yes 13 22
confusionMatrix(test_variance$readmitted, y_pred_svml)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 15 8
## Yes 13 22
##
## Accuracy : 0.6379
## 95% CI : (0.5012, 0.7601)
## No Information Rate : 0.5172
## P-Value [Acc > NIR] : 0.0431
##
## Kappa : 0.2707
##
## Mcnemar's Test P-Value : 0.3827
##
## Sensitivity : 0.5357
## Specificity : 0.7333
## Pos Pred Value : 0.6522
## Neg Pred Value : 0.6286
## Prevalence : 0.4828
## Detection Rate : 0.2586
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6345
##
## 'Positive' Class : No
##
#SVM model (kernel = radial)
classifier_radial = svm(formula = readmitted~., data = train_variance, kernel = "radial")
classifier_radial
##
## Call:
## svm(formula = readmitted ~ ., data = train_variance, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 210
#Prediction on test data
y_pred_svmr = predict(classifier_radial, newdata = test_variance)
y_pred_svmr
## 461 824 962 1984 2309 5016 6129 6452 7698 9471 11728
## Yes No No Yes Yes No Yes Yes Yes No Yes
## 15359 15538 17795 18511 20963 23943 25851 29060 29366 30649 31935
## No Yes No Yes No No No No Yes Yes Yes
## 32130 34606 35975 36349 36685 39784 39833 40684 44259 44286 46721
## Yes No Yes Yes Yes No Yes No No Yes Yes
## 51517 57291 57846 58435 60051 65125 66118 66395 68026 68269 68348
## No Yes Yes Yes No Yes No Yes Yes No No
## 70681 71862 72980 74933 76453 76775 76814 77043 78993 79192 82660
## Yes Yes Yes No No Yes Yes No Yes Yes Yes
## 84909 88479 101030
## Yes Yes Yes
## Levels: No Yes
#Confusion matrix
cm_svmr = table(test_variance$readmitted, y_pred_svmr)
cm_svmr
## y_pred_svmr
## No Yes
## No 13 10
## Yes 9 26
confusionMatrix(test_variance$readmitted, y_pred_svmr)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 13 10
## Yes 9 26
##
## Accuracy : 0.6724
## 95% CI : (0.5366, 0.7899)
## No Information Rate : 0.6207
## P-Value [Acc > NIR] : 0.2514
##
## Kappa : 0.3104
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5909
## Specificity : 0.7222
## Pos Pred Value : 0.5652
## Neg Pred Value : 0.7429
## Prevalence : 0.3793
## Detection Rate : 0.2241
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6566
##
## 'Positive' Class : No
##
#SVM tune
svm_tune <- tune(svm, train.x = train_variance %>% select(-readmitted),
train.y = train_variance$readmitted,
kernel = "radial", ranges = list(cost = 10^(-1:2), gamma = c(0.25, 0.5, 1,2)))
svm_tune
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 0.25
##
## - best performance: 0.3934783
#SVM model using best cost and gamma parameters obtained after tuning.
classifier_cg = svm(formula = as.factor(train_variance$readmitted)~., data = train_variance, type = "C-classification", kernel = "radial", cost = 10, gamma = 0.25)
#Predictions on test set
y_pred_cg = predict(classifier_cg, newdata = test_variance)
cm_cg = table(test_variance$readmitted, y_pred_cg)
cm_cg
## y_pred_cg
## No Yes
## No 0 23
## Yes 1 34
confusionMatrix(test_variance$readmitted, y_pred_cg)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 0 23
## Yes 1 34
##
## Accuracy : 0.5862
## 95% CI : (0.4493, 0.714)
## No Information Rate : 0.9828
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0342
##
## Mcnemar's Test P-Value : 1.814e-05
##
## Sensitivity : 0.00000
## Specificity : 0.59649
## Pos Pred Value : 0.00000
## Neg Pred Value : 0.97143
## Prevalence : 0.01724
## Detection Rate : 0.00000
## Detection Prevalence : 0.39655
## Balanced Accuracy : 0.29825
##
## 'Positive' Class : No
##
summary(classifier_cg)
##
## Call:
## svm(formula = as.factor(train_variance$readmitted) ~ ., data = train_variance,
## type = "C-classification", kernel = "radial", cost = 10, gamma = 0.25)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 231
##
## ( 138 93 )
##
##
## Number of Classes: 2
##
## Levels:
## No Yes
#SVM ROC
svm_roc = roc(test_variance$readmitted, factor(y_pred_cg, ordered = T), plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#SVM AUC
svm_auc <- auc(svm_roc)
svm_auc
## Area under the curve: 0.4857
#Append comparison lists
Model_Names <- append(Model_Names, "SVM")
Model_AUC_Values <- append(Model_AUC_Values, svm_auc)
K-nn
#Determine best k value
vec = c()
k_vec = c()
for(k in 1:80){
y_pred_knn = knn(train = train_set %>% select(-readmitted),
test = test_set %>% select(-readmitted),
cl = train_set$readmitted,
k=k)
error = mean(y_pred_knn != test_set$readmitted)
k_vec = c(k_vec, k)
vec = c(vec, error)
}
cbind(vec)
## vec
## [1,] 0.4827586
## [2,] 0.4482759
## [3,] 0.4482759
## [4,] 0.3793103
## [5,] 0.4310345
## [6,] 0.4482759
## [7,] 0.4310345
## [8,] 0.3793103
## [9,] 0.3448276
## [10,] 0.3965517
## [11,] 0.3275862
## [12,] 0.2931034
## [13,] 0.2931034
## [14,] 0.3103448
## [15,] 0.2586207
## [16,] 0.2758621
## [17,] 0.2931034
## [18,] 0.3103448
## [19,] 0.3103448
## [20,] 0.3103448
## [21,] 0.3448276
## [22,] 0.3275862
## [23,] 0.3275862
## [24,] 0.2931034
## [25,] 0.3448276
## [26,] 0.3448276
## [27,] 0.3448276
## [28,] 0.2931034
## [29,] 0.3275862
## [30,] 0.3103448
## [31,] 0.3103448
## [32,] 0.2931034
## [33,] 0.2758621
## [34,] 0.2758621
## [35,] 0.2758621
## [36,] 0.2931034
## [37,] 0.2931034
## [38,] 0.3275862
## [39,] 0.3275862
## [40,] 0.3275862
## [41,] 0.3103448
## [42,] 0.3620690
## [43,] 0.3275862
## [44,] 0.3103448
## [45,] 0.3103448
## [46,] 0.3103448
## [47,] 0.3275862
## [48,] 0.3103448
## [49,] 0.3275862
## [50,] 0.3103448
## [51,] 0.3620690
## [52,] 0.3103448
## [53,] 0.3103448
## [54,] 0.3448276
## [55,] 0.3448276
## [56,] 0.3275862
## [57,] 0.3620690
## [58,] 0.3275862
## [59,] 0.3620690
## [60,] 0.3448276
## [61,] 0.3620690
## [62,] 0.3620690
## [63,] 0.3275862
## [64,] 0.2758621
## [65,] 0.3275862
## [66,] 0.3448276
## [67,] 0.3448276
## [68,] 0.3275862
## [69,] 0.3793103
## [70,] 0.3620690
## [71,] 0.3793103
## [72,] 0.3448276
## [73,] 0.3448276
## [74,] 0.3275862
## [75,] 0.3448276
## [76,] 0.3620690
## [77,] 0.3448276
## [78,] 0.3620690
## [79,] 0.3275862
## [80,] 0.3448276
min(vec)
## [1] 0.2586207
df_error = data.frame(k_vec, vec)
ggplot(df_error, aes(k_vec, vec)) +geom_line()
#Extract minimum value
min_vec <- k_vec[which.min(vec)]
min_vec
## [1] 15
#K-nn model using best k value from graph
y_pred_k = knn(train = train_set %>% select(-readmitted),
test = test_set %>% select(-readmitted),
cl = train_set$readmitted,
k = min_vec, prob = T)
y_pred_k
## [1] Yes Yes No Yes Yes No Yes Yes Yes No No No Yes Yes No No No No No
## [20] Yes No No Yes No Yes Yes Yes No No No No Yes Yes No Yes Yes Yes No
## [39] Yes No Yes Yes No No Yes No Yes No No Yes Yes No Yes Yes Yes Yes Yes
## [58] Yes
## attr(,"prob")
## [1] 0.6000000 0.5333333 0.6000000 0.6000000 0.5333333 0.7333333 0.5333333
## [8] 0.8000000 0.6666667 0.6000000 0.7333333 0.6000000 0.6666667 0.5333333
## [15] 0.6000000 0.6000000 0.7500000 0.6666667 0.7333333 0.5333333 0.5333333
## [22] 0.6000000 0.5333333 0.6666667 0.6000000 0.8666667 0.6000000 0.5333333
## [29] 0.5333333 0.7333333 0.6666667 0.8000000 0.8666667 0.6666667 0.5333333
## [36] 0.6000000 0.6000000 0.6000000 0.5333333 0.5333333 0.8000000 0.5333333
## [43] 0.7333333 0.6666667 0.8666667 0.6000000 0.6666667 0.6666667 0.7333333
## [50] 0.6000000 0.6666667 0.6666667 0.8666667 0.6000000 0.6000000 0.8000000
## [57] 0.5333333 0.6000000
## Levels: No Yes
#Confusion matrix
confusionMatrix(test_set$readmitted, y_pred_k, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 17 6
## Yes 9 26
##
## Accuracy : 0.7414
## 95% CI : (0.6096, 0.8474)
## No Information Rate : 0.5517
## P-Value [Acc > NIR] : 0.002296
##
## Kappa : 0.4714
##
## Mcnemar's Test P-Value : 0.605577
##
## Sensitivity : 0.8125
## Specificity : 0.6538
## Pos Pred Value : 0.7429
## Neg Pred Value : 0.7391
## Prevalence : 0.5517
## Detection Rate : 0.4483
## Detection Prevalence : 0.6034
## Balanced Accuracy : 0.7332
##
## 'Positive' Class : Yes
##
#K-nn ROC
knn_roc <- roc(test_set$readmitted, attr(y_pred_k, 'prob'), plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls > cases
#K-nn AUC
knn_auc<- auc(knn_roc)
knn_auc
## Area under the curve: 0.5329
#Append comparison lists
Model_Names <- append(Model_Names, "K-nn")
Model_AUC_Values <- append(Model_AUC_Values, knn_auc)
XGBoost.
train_set$readmitted <- ifelse(train_set$readmitted == "Yes", 1, 0)
test_set$readmitted <- ifelse(test_set$readmitted == "Yes", 1, 0)
str(train_set$readmitted)
## num [1:231] 1 0 0 1 1 1 1 1 0 1 ...
str(test_set$readmitted)
## num [1:58] 1 1 1 1 1 1 1 1 0 0 ...
summary(train_set$readmitted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.5974 1.0000 1.0000
#XGBoost model
xgb_classifier = xgboost(data = as.matrix(train_set %>% select(-readmitted)), label = train_set$readmitted, nrounds = 10)
## [1] train-rmse:0.424124
## [2] train-rmse:0.372133
## [3] train-rmse:0.324329
## [4] train-rmse:0.278860
## [5] train-rmse:0.256454
## [6] train-rmse:0.229002
## [7] train-rmse:0.202807
## [8] train-rmse:0.180816
## [9] train-rmse:0.167502
## [10] train-rmse:0.156990
#Prediction on test dataset
xgb_y_prob = predict(xgb_classifier, newdata = as.matrix(test_set %>% select(-readmitted)), type='prob')
head(xgb_y_prob)
## [1] 0.783411264 0.144975439 0.002184162 0.461754203 0.400559574 0.266678959
xgb_y_prob
## [1] 0.783411264 0.144975439 0.002184162 0.461754203 0.400559574
## [6] 0.266678959 0.584059536 0.609397054 0.834373891 -0.012347283
## [11] 0.844061494 0.275242180 0.844205260 0.669071317 0.573735237
## [16] 0.298393667 0.222442180 0.408580214 0.210570022 0.801152050
## [21] 0.354468495 0.348010361 0.968341291 0.568436027 0.717948437
## [26] 1.019045591 0.644306839 0.615441024 1.041798830 0.764605165
## [31] 0.544302821 0.879637778 0.944378018 0.223458529 0.601379514
## [36] 0.807839930 0.452889353 0.267640650 0.632538140 0.410308629
## [41] 0.785312235 0.727491140 0.139402330 0.340008676 0.841710329
## [46] 0.566241682 0.832976997 0.360108018 0.655890405 0.729740977
## [51] 0.607832670 0.334819108 0.860021234 0.499060154 0.878506184
## [56] 0.453960270 0.478604823 0.509950459
#Thresholding
xgb_y_pred = (xgb_y_prob >= 0.5)
head(xgb_y_pred)
## [1] TRUE FALSE FALSE FALSE FALSE FALSE
#Confusion Matrix
cm = table(test_set$readmitted, xgb_y_pred)
cm
## xgb_y_pred
## FALSE TRUE
## 0 13 10
## 1 11 24
imp_matrix = xgb.importance(model=xgb_classifier)
imp_matrix
## Feature Gain Cover Frequency
## 1: num_lab_procedures 0.2689344597 0.2436158192 0.233552632
## 2: num_medications 0.1429274797 0.1529943503 0.115131579
## 3: time_in_hospital 0.0996285091 0.0835404896 0.148026316
## 4: number_inpatient 0.0614227957 0.0841431262 0.055921053
## 5: race_AfricanAmerican 0.0449953651 0.0198870056 0.019736842
## 6: gender.Female 0.0366176337 0.0378907721 0.019736842
## 7: number_diagnoses 0.0276272189 0.0277212806 0.042763158
## 8: num_procedures 0.0252179684 0.0147645951 0.036184211
## 9: admission_source_id.1 0.0212936225 0.0067796610 0.016447368
## 10: age_30_40 0.0191114260 0.0206403013 0.023026316
## 11: discharge_disposition_id.6 0.0177406620 0.0519020716 0.016447368
## 12: Other_diag 0.0170822734 0.0192090395 0.013157895
## 13: age_70_80 0.0164914895 0.0166478343 0.016447368
## 14: Diabetes_diag 0.0153439623 0.0186817326 0.019736842
## 15: discharge_disposition_id.3 0.0141633244 0.0071563089 0.013157895
## 16: max_glu_serum_300 0.0139926016 0.0095668550 0.009868421
## 17: diabetes_med_yes 0.0136061144 0.0104708098 0.013157895
## 18: med_change_yes 0.0130305505 0.0042184557 0.003289474
## 19: A1Cresult_8 0.0127230970 0.0103954802 0.023026316
## 20: admission_source_id.7 0.0102342724 0.0488135593 0.013157895
## 21: Circulatory_diag 0.0099948213 0.0090395480 0.013157895
## 22: age_50_60 0.0098029727 0.0143126177 0.013157895
## 23: admission_type_id.1 0.0091304323 0.0084369115 0.009868421
## 24: max_glu_serum_Norm 0.0089660415 0.0067796610 0.006578947
## 25: number_outpatient 0.0086200964 0.0127306968 0.009868421
## 26: age_80_90 0.0078507218 0.0143879473 0.019736842
## 27: Respiraoty_diag 0.0074156417 0.0030131827 0.003289474
## 28: discharge_disposition_id.1 0.0071481078 0.0065536723 0.006578947
## 29: A1Cresult_7 0.0070480323 0.0048210923 0.006578947
## 30: race_Caucasian 0.0067382781 0.0070809793 0.009868421
## 31: Injury_diag 0.0063631801 0.0053483992 0.006578947
## 32: max_glu_serum_200 0.0036252880 0.0054990584 0.006578947
## 33: age_60_70 0.0032184927 0.0019585687 0.006578947
## 34: discharge_disposition_id.2 0.0030814088 0.0022598870 0.003289474
## 35: race_Hispanic 0.0030310561 0.0015065913 0.003289474
## 36: age_40_50 0.0028365706 0.0025612053 0.009868421
## 37: A1Cresult_Norm 0.0018955480 0.0033145009 0.006578947
## 38: Genitourinary_diag 0.0008495848 0.0009039548 0.003289474
## 39: Digestive_diag 0.0001988987 0.0004519774 0.003289474
## Feature Gain Cover Frequency
xgb.plot.importance(imp_matrix)
#Xgboost ROC
xgb_roc <- roc(test_set$readmitted, xgb_y_prob, plot = T, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#Xgboost AUC
xgb_auc <- auc(xgb_roc)
xgb_auc
## Area under the curve: 0.6497
#Append comparison lists
Model_Names <- append(Model_Names, "XGBoost")
Model_AUC_Values <- append(Model_AUC_Values, xgb_auc)
Models with PCA data
Logistic Regression using PCA data
#LR PCA model1
lr_pca1<- glm(readmitted~., data = train_pca, family = binomial)
summary(lr_pca1)
##
## Call:
## glm(formula = readmitted ~ ., family = binomial, data = train_pca)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8301 -1.1676 0.7582 0.9910 1.4137
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.98875 0.25875 3.821 0.000133 ***
## PC1 0.09139 0.03660 2.497 0.012529 *
## PC2 0.03068 0.03828 0.801 0.422918
## PC3 0.14388 0.04300 3.346 0.000821 ***
## PC4 -0.05130 0.03538 -1.450 0.147015
## PC5 -0.03863 0.04397 -0.879 0.379588
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 294.50 on 225 degrees of freedom
## AIC: 306.5
##
## Number of Fisher Scoring iterations: 4
#LR PCA model2
lr_pca2 <- glm(readmitted~ PC1 + PC3, data = train_pca, family = binomial)
summary(lr_pca2)
##
## Call:
## glm(formula = readmitted ~ PC1 + PC3, family = binomial, data = train_pca)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8601 -1.1934 0.7809 0.9935 1.4373
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.80126 0.18400 4.355 1.33e-05 ***
## PC1 0.07752 0.03412 2.272 0.02311 *
## PC3 0.13207 0.04043 3.267 0.00109 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 297.70 on 228 degrees of freedom
## AIC: 303.7
##
## Number of Fisher Scoring iterations: 4
#LR PCA model3
lr_pca3 <- glm(readmitted~ PC3, data = train_pca, family = binomial)
summary(lr_pca3)
##
## Call:
## glm(formula = readmitted ~ PC3, family = binomial, data = train_pca)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7831 -1.2565 0.8426 1.0093 1.4092
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.61685 0.16002 3.855 0.000116 ***
## PC3 0.10997 0.03894 2.824 0.004747 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 311.41 on 230 degrees of freedom
## Residual deviance: 303.09 on 229 degrees of freedom
## AIC: 307.09
##
## Number of Fisher Scoring iterations: 4
#Prediction of test pca data using LR_PCA model2 of train pca data (model with lowest AIC value)
lr_pca_predict <- predict(lr_pca2,newdata = test_pca, type = "response" )
#Thresholding to obtain class for confusion matrix
result_pca0.5 = ifelse(lr_pca_predict >=0.5, "Yes", "No")
result_pca0.5
## 461 824 962 1984 2309 5016 6129 6452 7698 9471 11728
## "Yes" "Yes" "No" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "No" "Yes"
## 15359 15538 17795 18511 20963 23943 25851 29060 29366 30649 31935
## "No" "Yes" "Yes" "Yes" "Yes" "No" "No" "Yes" "Yes" "Yes" "Yes"
## 32130 34606 35975 36349 36685 39784 39833 40684 44259 44286 46721
## "Yes" "No" "Yes" "Yes" "No" "No" "Yes" "Yes" "No" "Yes" "Yes"
## 51517 57291 57846 58435 60051 65125 66118 66395 68026 68269 68348
## "No" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes" "Yes" "No"
## 70681 71862 72980 74933 76453 76775 76814 77043 78993 79192 82660
## "Yes" "No" "Yes" "No" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## 84909 88479 101030
## "Yes" "Yes" "Yes"
table(test_pca$readmitted, result_pca0.5)
## result_pca0.5
## No Yes
## No 8 15
## Yes 7 28
result_pca_fac0.5 = factor(result_pca0.5)
#Confusion Matrix
confusionMatrix(test_pca$readmitted, result_pca_fac0.5)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 8 15
## Yes 7 28
##
## Accuracy : 0.6207
## 95% CI : (0.4837, 0.7449)
## No Information Rate : 0.7414
## P-Value [Acc > NIR] : 0.9851
##
## Kappa : 0.1572
##
## Mcnemar's Test P-Value : 0.1356
##
## Sensitivity : 0.5333
## Specificity : 0.6512
## Pos Pred Value : 0.3478
## Neg Pred Value : 0.8000
## Prevalence : 0.2586
## Detection Rate : 0.1379
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.5922
##
## 'Positive' Class : No
##
#LR PCA ROC
lr_pca_roc <-roc(test_pca$readmitted, lr_pca_predict, plot = TRUE, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#LR PCA AUC
lr_pca_auc <- auc(lr_pca_roc)
lr_pca_auc
## Area under the curve: 0.6447
#Append comparison lists
Model_Names <- append(Model_Names, "Logistic Regression PCA")
Model_AUC_Values <- append(Model_AUC_Values, lr_pca_auc)
SVM using PCA data
#SVM PCA model(kernel = linear)
classifier_pca_linear = svm(formula = readmitted~., data = train_pca, kernel = "linear")
classifier_pca_linear
##
## Call:
## svm(formula = readmitted ~ ., data = train_pca, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 187
#Predictions on test data
y_pred_svml_pca = predict(classifier_pca_linear, newdata = test_pca)
#Confusion matrix
cm_svml_pca = table(test_pca$readmitted, y_pred_svml_pca)
cm_svml_pca
## y_pred_svml_pca
## No Yes
## No 9 14
## Yes 6 29
confusionMatrix(test_pca$readmitted, y_pred_svml_pca)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9 14
## Yes 6 29
##
## Accuracy : 0.6552
## 95% CI : (0.5188, 0.7751)
## No Information Rate : 0.7414
## P-Value [Acc > NIR] : 0.9469
##
## Kappa : 0.2338
##
## Mcnemar's Test P-Value : 0.1175
##
## Sensitivity : 0.6000
## Specificity : 0.6744
## Pos Pred Value : 0.3913
## Neg Pred Value : 0.8286
## Prevalence : 0.2586
## Detection Rate : 0.1552
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6372
##
## 'Positive' Class : No
##
#SVM model (kernel = radial)
classifier_radial_pca = svm(formula = readmitted~., data = train_pca, kernel = "radial")
classifier_radial_pca
##
## Call:
## svm(formula = readmitted ~ ., data = train_pca, kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 187
#Prediction on test data
y_pred_svmr_pca = predict(classifier_radial_pca, newdata = test_pca)
y_pred_svmr_pca
## 461 824 962 1984 2309 5016 6129 6452 7698 9471 11728
## Yes Yes Yes Yes Yes No No Yes Yes Yes Yes
## 15359 15538 17795 18511 20963 23943 25851 29060 29366 30649 31935
## No Yes No Yes Yes Yes Yes No No Yes No
## 32130 34606 35975 36349 36685 39784 39833 40684 44259 44286 46721
## Yes Yes Yes Yes Yes No Yes Yes No Yes Yes
## 51517 57291 57846 58435 60051 65125 66118 66395 68026 68269 68348
## No Yes Yes Yes Yes Yes No Yes Yes No No
## 70681 71862 72980 74933 76453 76775 76814 77043 78993 79192 82660
## Yes No Yes No Yes Yes Yes Yes Yes Yes Yes
## 84909 88479 101030
## Yes Yes Yes
## Levels: No Yes
#Confusion matrix
cm_svmr_pca = table(test_pca$readmitted, y_pred_svmr_pca)
cm_svmr_pca
## y_pred_svmr_pca
## No Yes
## No 7 16
## Yes 8 27
confusionMatrix(test_pca$readmitted, y_pred_svmr_pca)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 7 16
## Yes 8 27
##
## Accuracy : 0.5862
## 95% CI : (0.4493, 0.714)
## No Information Rate : 0.7414
## P-Value [Acc > NIR] : 0.9968
##
## Kappa : 0.0806
##
## Mcnemar's Test P-Value : 0.1530
##
## Sensitivity : 0.4667
## Specificity : 0.6279
## Pos Pred Value : 0.3043
## Neg Pred Value : 0.7714
## Prevalence : 0.2586
## Detection Rate : 0.1207
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.5473
##
## 'Positive' Class : No
##
#SVM tune
svm_pca_tune <- tune(svm, train.x = train_pca %>% select(-readmitted),
train.y = train_pca$readmitted,
kernel = "radial", ranges = list(cost = 10^(-1:2), gamma = c(0.25, 0.5, 1,2)))
svm_pca_tune
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 100 2
##
## - best performance: 0.3552536
#SVM model using best cost and gamma parameters obtained after tuning.
classifier_cg_pca = svm(formula = as.factor(train_pca$readmitted)~., data = train_pca, type = "C-classification", kernel = "radial", cost = 100, gamma = 2)
#Predictions on test set
y_pred_cg_pca = predict(classifier_cg_pca, newdata = test_pca)
cm_cg_pca = table(test_pca$readmitted, y_pred_cg_pca)
cm_cg_pca
## y_pred_cg_pca
## No Yes
## No 9 14
## Yes 14 21
confusionMatrix(test_pca$readmitted, y_pred_cg_pca)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9 14
## Yes 14 21
##
## Accuracy : 0.5172
## 95% CI : (0.3822, 0.6505)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.929
##
## Kappa : -0.0087
##
## Mcnemar's Test P-Value : 1.000
##
## Sensitivity : 0.3913
## Specificity : 0.6000
## Pos Pred Value : 0.3913
## Neg Pred Value : 0.6000
## Prevalence : 0.3966
## Detection Rate : 0.1552
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.4957
##
## 'Positive' Class : No
##
summary(classifier_cg_pca)
##
## Call:
## svm(formula = as.factor(train_pca$readmitted) ~ ., data = train_pca,
## type = "C-classification", kernel = "radial", cost = 100, gamma = 2)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 100
##
## Number of Support Vectors: 215
##
## ( 129 86 )
##
##
## Number of Classes: 2
##
## Levels:
## No Yes
#SVM PCA ROC
svm_pca_roc = roc(test_pca$readmitted, factor(y_pred_cg_pca, ordered = T), plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#SVM PCA AUC
svm_pca_auc <- auc(svm_roc)
svm_pca_auc
## Area under the curve: 0.4857
#Append comparison lists
Model_Names <- append(Model_Names, "SVM PCA")
Model_AUC_Values <- append(Model_AUC_Values, svm_pca_auc)
K-nn using PCA data
#Determine best k value
vec_pca = c()
k_vec_pca = c()
for(k_pca in 1:80){
y_pred_knn_pca = knn(train = train_pca %>% select(-readmitted),
test = test_pca %>% select(-readmitted),
cl = train_pca$readmitted,
k=k)
error_pca = mean(y_pred_knn_pca != test_pca$readmitted)
k_vec_pca = c(k_vec_pca, k_pca)
vec_pca = c(vec_pca, error_pca)
}
cbind(vec_pca)
## vec_pca
## [1,] 0.3793103
## [2,] 0.3448276
## [3,] 0.3448276
## [4,] 0.3448276
## [5,] 0.3620690
## [6,] 0.3793103
## [7,] 0.3620690
## [8,] 0.3448276
## [9,] 0.3620690
## [10,] 0.3620690
## [11,] 0.3620690
## [12,] 0.3448276
## [13,] 0.3448276
## [14,] 0.3448276
## [15,] 0.3793103
## [16,] 0.3793103
## [17,] 0.3448276
## [18,] 0.3620690
## [19,] 0.3448276
## [20,] 0.3620690
## [21,] 0.3793103
## [22,] 0.3448276
## [23,] 0.3448276
## [24,] 0.3620690
## [25,] 0.3448276
## [26,] 0.3620690
## [27,] 0.3620690
## [28,] 0.3620690
## [29,] 0.3793103
## [30,] 0.3448276
## [31,] 0.3448276
## [32,] 0.3448276
## [33,] 0.3620690
## [34,] 0.3620690
## [35,] 0.3448276
## [36,] 0.3620690
## [37,] 0.3620690
## [38,] 0.3620690
## [39,] 0.3448276
## [40,] 0.3620690
## [41,] 0.3620690
## [42,] 0.3793103
## [43,] 0.3793103
## [44,] 0.3620690
## [45,] 0.3448276
## [46,] 0.3620690
## [47,] 0.3448276
## [48,] 0.3448276
## [49,] 0.3620690
## [50,] 0.3793103
## [51,] 0.3620690
## [52,] 0.3620690
## [53,] 0.3793103
## [54,] 0.3448276
## [55,] 0.3620690
## [56,] 0.3448276
## [57,] 0.3620690
## [58,] 0.3620690
## [59,] 0.3448276
## [60,] 0.3620690
## [61,] 0.3620690
## [62,] 0.3793103
## [63,] 0.3620690
## [64,] 0.3793103
## [65,] 0.3448276
## [66,] 0.3793103
## [67,] 0.3448276
## [68,] 0.3620690
## [69,] 0.3620690
## [70,] 0.3793103
## [71,] 0.3620690
## [72,] 0.3793103
## [73,] 0.3448276
## [74,] 0.3448276
## [75,] 0.3620690
## [76,] 0.3620690
## [77,] 0.3620690
## [78,] 0.3448276
## [79,] 0.3620690
## [80,] 0.3793103
min(vec_pca)
## [1] 0.3448276
df_error_pca = data.frame(k_vec_pca, vec_pca)
ggplot(df_error_pca, aes(k_vec_pca, vec_pca)) +geom_line()
#Extract minimum value
min_vec_pca <- k_vec_pca[which.min(vec_pca)]
min_vec_pca
## [1] 2
#K-nn model using best k value from graph
y_pred_k_pca = knn(train = train_pca %>% select(-readmitted),
test = test_pca %>% select(-readmitted),
cl = train_pca$readmitted,
k = min_vec_pca, prob = T)
y_pred_k_pca
## [1] Yes Yes No Yes Yes No No Yes Yes No No No Yes Yes Yes No Yes No No
## [20] No No No Yes No Yes Yes Yes No Yes Yes No Yes Yes No No No Yes Yes
## [39] Yes No No No No Yes Yes No No No No Yes Yes No Yes Yes No Yes No
## [58] Yes
## attr(,"prob")
## [1] 1.0 1.0 1.0 0.5 1.0 0.5 1.0 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5 1.0 0.5 0.5
## [20] 0.5 1.0 0.5 1.0 1.0 1.0 1.0 0.5 1.0 1.0 0.5 1.0 1.0 1.0 1.0 0.5 0.5 1.0 0.5
## [39] 0.5 1.0 0.5 0.5 0.5 1.0 1.0 1.0 0.5 1.0 0.5 1.0 1.0 0.5 1.0 1.0 0.5 1.0 0.5
## [58] 1.0
## Levels: No Yes
#Confusion matrix
confusionMatrix(test_pca$readmitted, y_pred_k_pca, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 13 10
## Yes 16 19
##
## Accuracy : 0.5517
## 95% CI : (0.4154, 0.6826)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.2559
##
## Kappa : 0.1034
##
## Mcnemar's Test P-Value : 0.3268
##
## Sensitivity : 0.6552
## Specificity : 0.4483
## Pos Pred Value : 0.5429
## Neg Pred Value : 0.5652
## Prevalence : 0.5000
## Detection Rate : 0.3276
## Detection Prevalence : 0.6034
## Balanced Accuracy : 0.5517
##
## 'Positive' Class : Yes
##
#K-nn PCA ROC
knn_pca_roc<- roc(test_pca$readmitted, attr(y_pred_k_pca, 'prob'), plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#K-nn PCA AUC
knn_pca_auc<- auc(knn_pca_roc)
knn_pca_auc
## Area under the curve: 0.546
#Append comparison lists
Model_Names <- append(Model_Names, "K-nn PCA")
Model_AUC_Values <- append(Model_AUC_Values, knn_pca_auc)
Cross Validation
train_set$readmitted <- ifelse(train_set$readmitted == 1, "Yes", "No")
test_set$readmitted <- ifelse(test_set$readmitted == 1, "Yes", "No")
train_set$readmitted <- as.factor(train_set$readmitted)
test_set$readmitted <- as.factor(test_set$readmitted)
str(train_set$readmitted)
## Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 2 ...
Logistic Regression Cross Validation.
train_control <- trainControl(method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary)
lr_cv = train(readmitted ~., data = train_set,
method = "glm",
metric="ROC",
tuneLength = 10,
trControl = train_control)
lr_cv
## Generalized Linear Model
##
## 231 samples
## 43 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 208, 208, 207, 208, 209, 208, ...
## Resampling results:
##
## ROC Sens Spec
## 0.6508852 0.4911111 0.6978022
lr_cv$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.6508852 0.4911111 0.6978022 0.1158645 0.203252 0.1317154
#Prediction of test data
lr_pred <- predict(lr_cv, test_set)
lr_pred
## [1] No No No Yes Yes No Yes Yes Yes No Yes No Yes No No No No No No
## [20] Yes Yes No Yes Yes No Yes Yes No Yes No No Yes Yes No Yes No Yes No
## [39] Yes No Yes Yes No No Yes No Yes No No Yes Yes No Yes No Yes Yes Yes
## [58] Yes
## Levels: No Yes
#Obtain probabilities
lr_pred_prob <- predict(lr_cv, test_set, type = "prob")
lr_pred_prob
## No Yes
## 461 0.675463352 0.32453665
## 824 0.947745309 0.05225469
## 962 0.826163335 0.17383667
## 1984 0.169275517 0.83072448
## 2309 0.259408539 0.74059146
## 5016 0.949737666 0.05026233
## 6129 0.091369048 0.90863095
## 6452 0.072585298 0.92741470
## 7698 0.079277404 0.92072260
## 9471 0.905648871 0.09435113
## 11728 0.240533615 0.75946638
## 15359 0.852050046 0.14794995
## 15538 0.247903978 0.75209602
## 17795 0.662634009 0.33736599
## 18511 0.510548178 0.48945182
## 20963 0.872472799 0.12752720
## 23943 0.733487687 0.26651231
## 25851 0.775771173 0.22422883
## 29060 0.516374021 0.48362598
## 29366 0.222229905 0.77777009
## 30649 0.102321559 0.89767844
## 31935 0.649185909 0.35081409
## 32130 0.085387095 0.91461290
## 34606 0.474587212 0.52541279
## 35975 0.505571313 0.49442869
## 36349 0.041702525 0.95829747
## 36685 0.157486296 0.84251370
## 39784 0.910804537 0.08919546
## 39833 0.288054914 0.71194509
## 40684 0.722441783 0.27755822
## 44259 0.561480202 0.43851980
## 44286 0.102238276 0.89776172
## 46721 0.207769314 0.79223069
## 51517 0.694022254 0.30597775
## 57291 0.453205894 0.54679411
## 57846 0.620038250 0.37996175
## 58435 0.036410839 0.96358916
## 60051 0.796717131 0.20328287
## 65125 0.039615248 0.96038475
## 66118 0.811508992 0.18849101
## 66395 0.284824737 0.71517526
## 68026 0.324725038 0.67527496
## 68269 0.622240232 0.37775977
## 68348 0.836533531 0.16346647
## 70681 0.046481004 0.95351900
## 71862 0.859933847 0.14006615
## 72980 0.008226799 0.99177320
## 74933 0.819798396 0.18020160
## 76453 0.681474155 0.31852585
## 76775 0.132532454 0.86746755
## 76814 0.178155427 0.82184457
## 77043 0.539392430 0.46060757
## 78993 0.167075654 0.83292435
## 79192 0.766071686 0.23392831
## 82660 0.113143933 0.88685607
## 84909 0.380328135 0.61967187
## 88479 0.484415642 0.51558436
## 101030 0.421614305 0.57838569
str(test_set$readmitted)
## Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
#Confusion Matrix
lr_cv_cm <- confusionMatrix(lr_pred, test_set$readmitted)
lr_cv_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 16 12
## Yes 7 23
##
## Accuracy : 0.6724
## 95% CI : (0.5366, 0.7899)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.1741
##
## Kappa : 0.3401
##
## Mcnemar's Test P-Value : 0.3588
##
## Sensitivity : 0.6957
## Specificity : 0.6571
## Pos Pred Value : 0.5714
## Neg Pred Value : 0.7667
## Prevalence : 0.3966
## Detection Rate : 0.2759
## Detection Prevalence : 0.4828
## Balanced Accuracy : 0.6764
##
## 'Positive' Class : No
##
#LR CV ROC
lr_cv_roc <- roc(test_set$readmitted, lr_pred_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#LR CV AUC
lr_cv_auc <- auc(lr_cv_roc)
lr_cv_auc
## Area under the curve: 0.7267
#Append comparison lists
Model_Names <- append(Model_Names, "Logistic Regression Cross Validation")
Model_AUC_Values <- append(Model_AUC_Values, lr_cv_auc)
Elastic Net Cross Validation.
glmnet_cv= train(readmitted ~., data = train_set,
method = "glmnet",
metric="ROC",
trControl = train_control)
glmnet_cv
## glmnet
##
## 231 samples
## 43 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 208, 209, 207, 207, 208, 208, ...
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.10 0.0002334386 0.6260501 0.4977778 0.7197802
## 0.10 0.0023343864 0.6254640 0.4977778 0.7269231
## 0.10 0.0233438645 0.6481502 0.5200000 0.7483516
## 0.55 0.0002334386 0.6246215 0.5088889 0.7197802
## 0.55 0.0023343864 0.6307143 0.5088889 0.7192308
## 0.55 0.0233438645 0.6723138 0.5077778 0.7917582
## 1.00 0.0002334386 0.6261905 0.5088889 0.7197802
## 1.00 0.0023343864 0.6330220 0.5088889 0.7192308
## 1.00 0.0233438645 0.6845849 0.4633333 0.8274725
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.02334386.
#Best parameters
glmnet_cv$bestTune
## alpha lambda
## 9 1 0.02334386
glmnet_cv$results
## alpha lambda ROC Sens Spec ROCSD SensSD
## 1 0.10 0.0002334386 0.6260501 0.4977778 0.7197802 0.1547494 0.1952663
## 2 0.10 0.0023343864 0.6254640 0.4977778 0.7269231 0.1510252 0.2088442
## 3 0.10 0.0233438645 0.6481502 0.5200000 0.7483516 0.1347680 0.1941391
## 4 0.55 0.0002334386 0.6246215 0.5088889 0.7197802 0.1573322 0.1950555
## 5 0.55 0.0023343864 0.6307143 0.5088889 0.7192308 0.1476213 0.2151211
## 6 0.55 0.0233438645 0.6723138 0.5077778 0.7917582 0.1170483 0.1773954
## 7 1.00 0.0002334386 0.6261905 0.5088889 0.7197802 0.1567712 0.1950555
## 8 1.00 0.0023343864 0.6330220 0.5088889 0.7192308 0.1384531 0.2086471
## 9 1.00 0.0233438645 0.6845849 0.4633333 0.8274725 0.1203069 0.1041064
## SpecSD
## 1 0.13017843
## 2 0.13602647
## 3 0.12526187
## 4 0.13017843
## 5 0.12546656
## 6 0.10192271
## 7 0.13017843
## 8 0.12546656
## 9 0.08901928
#Prediction on test set
glmnet_cv_pred <- predict(glmnet_cv, test_set)
glmnet_cv_pred
## [1] No No No Yes Yes No Yes Yes Yes No Yes No Yes Yes Yes No No No No
## [20] Yes Yes No Yes Yes Yes Yes Yes No Yes Yes Yes Yes Yes No Yes Yes Yes No
## [39] Yes No Yes Yes No No Yes No Yes No No Yes Yes No Yes No Yes Yes Yes
## [58] Yes
## Levels: No Yes
#Obtain probabilities
glmnet_cv_prob <- predict(glmnet_cv, test_set, type = "prob")
glmnet_cv_prob
## No Yes
## 1 0.57996883 0.4200312
## 2 0.69385962 0.3061404
## 3 0.66739281 0.3326072
## 4 0.28402550 0.7159745
## 5 0.35378848 0.6462115
## 6 0.71592726 0.2840727
## 7 0.21716228 0.7828377
## 8 0.25824862 0.7417514
## 9 0.24884354 0.7511565
## 10 0.74807346 0.2519265
## 11 0.27305866 0.7269413
## 12 0.72822752 0.2717725
## 13 0.34313393 0.6568661
## 14 0.45496872 0.5450313
## 15 0.45709405 0.5429059
## 16 0.64555475 0.3544452
## 17 0.65923921 0.3407608
## 18 0.55242522 0.4475748
## 19 0.51727656 0.4827234
## 20 0.37414121 0.6258588
## 21 0.31512401 0.6848760
## 22 0.50138927 0.4986107
## 23 0.15751985 0.8424801
## 24 0.49643791 0.5035621
## 25 0.44967050 0.5503295
## 26 0.15088027 0.8491197
## 27 0.30778563 0.6922144
## 28 0.73282226 0.2671777
## 29 0.22831025 0.7716897
## 30 0.49690211 0.5030979
## 31 0.44851411 0.5514859
## 32 0.23211403 0.7678860
## 33 0.36042008 0.6395799
## 34 0.59755350 0.4024465
## 35 0.34510891 0.6548911
## 36 0.48775666 0.5122433
## 37 0.18883683 0.8111632
## 38 0.60933876 0.3906612
## 39 0.13194910 0.8680509
## 40 0.64238675 0.3576133
## 41 0.29023603 0.7097640
## 42 0.39919973 0.6008003
## 43 0.54721781 0.4527822
## 44 0.66832706 0.3316729
## 45 0.13933073 0.8606693
## 46 0.72818275 0.2718172
## 47 0.06732332 0.9326767
## 48 0.72355879 0.2764412
## 49 0.54083040 0.4591696
## 50 0.25356765 0.7464324
## 51 0.26435849 0.7356415
## 52 0.53782960 0.4621704
## 53 0.24163410 0.7583659
## 54 0.61688131 0.3831187
## 55 0.27494688 0.7250531
## 56 0.33167862 0.6683214
## 57 0.42167999 0.5783200
## 58 0.42138281 0.5786172
#Confusion matrix
glmnet_cm <-confusionMatrix(glmnet_cv_pred, test_set$readmitted)
glmnet_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 14 8
## Yes 9 27
##
## Accuracy : 0.7069
## 95% CI : (0.5727, 0.8191)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.06804
##
## Kappa : 0.383
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.6087
## Specificity : 0.7714
## Pos Pred Value : 0.6364
## Neg Pred Value : 0.7500
## Prevalence : 0.3966
## Detection Rate : 0.2414
## Detection Prevalence : 0.3793
## Balanced Accuracy : 0.6901
##
## 'Positive' Class : No
##
#ElasticNet CV ROC
glmnet_cv_roc <- roc(test_set$readmitted, glmnet_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#ElasticNet CV AUC
glmnet_cv_auc <- auc(glmnet_cv_roc)
glmnet_cv_auc
## Area under the curve: 0.6994
#Append comparison lists
Model_Names <- append(Model_Names, "GLMNet Cross Validation")
Model_AUC_Values <- append(Model_AUC_Values, glmnet_cv_auc)
SVM Cross Validation.
SVM Radial Cross Validation
svmr_cv <- train(readmitted ~., data = train_set,
method = "svmRadial",
metric="ROC",
trControl = train_control,
preProcess = c("center", "scale"))
svmr_cv
## Support Vector Machines with Radial Basis Function Kernel
##
## 231 samples
## 43 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (43), scaled (43)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 209, 207, 208, 208, 208, 209, ...
## Resampling results across tuning parameters:
##
## C ROC Sens Spec
## 0.25 0.6391514 0.3755556 0.8115385
## 0.50 0.6407387 0.3955556 0.8192308
## 1.00 0.6263004 0.2555556 0.8917582
##
## Tuning parameter 'sigma' was held constant at a value of 0.01305887
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01305887 and C = 0.5.
#Best Tuned parameters
svmr_cv$bestTune
## sigma C
## 2 0.01305887 0.5
svmr_cv$results
## sigma C ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.01305887 0.25 0.6391514 0.3755556 0.8115385 0.08099193 0.1242591 0.09054549
## 2 0.01305887 0.50 0.6407387 0.3955556 0.8192308 0.07913294 0.1525458 0.08981645
## 3 0.01305887 1.00 0.6263004 0.2555556 0.8917582 0.09661604 0.1296825 0.08476708
#Prediction on test set
svmr_cv_pred <- predict(svmr_cv, test_set)
svmr_cv_pred
## [1] No No No Yes Yes No Yes Yes Yes No Yes No Yes No No No No No No
## [20] No Yes No Yes No Yes Yes Yes No Yes No No Yes Yes No Yes No Yes No
## [39] Yes No Yes Yes No No Yes No Yes No No Yes Yes No Yes No Yes Yes No
## [58] Yes
## Levels: No Yes
#Obtain probabilities
svmr_cv_prob <- predict(svmr_cv, test_set, type = "prob")
svmr_cv_prob
## No Yes
## 1 0.5769947 0.4230053
## 2 0.5763076 0.4236924
## 3 0.6838934 0.3161066
## 4 0.3620863 0.6379137
## 5 0.3604649 0.6395351
## 6 0.7759164 0.2240836
## 7 0.3191321 0.6808679
## 8 0.2468887 0.7531113
## 9 0.2749433 0.7250567
## 10 0.6577235 0.3422765
## 11 0.4207962 0.5792038
## 12 0.6907564 0.3092436
## 13 0.2942528 0.7057472
## 14 0.5898328 0.4101672
## 15 0.5464387 0.4535613
## 16 0.5535083 0.4464917
## 17 0.6567153 0.3432847
## 18 0.6750684 0.3249316
## 19 0.6916219 0.3083781
## 20 0.5711105 0.4288895
## 21 0.4754318 0.5245682
## 22 0.5327080 0.4672920
## 23 0.1956482 0.8043518
## 24 0.5560474 0.4439526
## 25 0.3911894 0.6088106
## 26 0.1567603 0.8432397
## 27 0.2860177 0.7139823
## 28 0.7485627 0.2514373
## 29 0.3758562 0.6241438
## 30 0.5429425 0.4570575
## 31 0.6103655 0.3896345
## 32 0.2453747 0.7546253
## 33 0.3488706 0.6511294
## 34 0.7199452 0.2800548
## 35 0.4371471 0.5628529
## 36 0.5192925 0.4807075
## 37 0.2393439 0.7606561
## 38 0.7468459 0.2531541
## 39 0.2314607 0.7685393
## 40 0.7429177 0.2570823
## 41 0.3175687 0.6824313
## 42 0.2736421 0.7263579
## 43 0.6667483 0.3332517
## 44 0.7371417 0.2628583
## 45 0.1959194 0.8040806
## 46 0.5686266 0.4313734
## 47 0.2391227 0.7608773
## 48 0.8925172 0.1074828
## 49 0.6096774 0.3903226
## 50 0.3288979 0.6711021
## 51 0.4003295 0.5996705
## 52 0.6296448 0.3703552
## 53 0.2260528 0.7739472
## 54 0.5816134 0.4183866
## 55 0.2401978 0.7598022
## 56 0.3795123 0.6204877
## 57 0.5588951 0.4411049
## 58 0.4904895 0.5095105
#Confusion matrix
svmr_cv_cm <- confusionMatrix(svmr_cv_pred, test_set$readmitted)
svmr_cv_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 17 13
## Yes 6 22
##
## Accuracy : 0.6724
## 95% CI : (0.5366, 0.7899)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.1741
##
## Kappa : 0.3495
##
## Mcnemar's Test P-Value : 0.1687
##
## Sensitivity : 0.7391
## Specificity : 0.6286
## Pos Pred Value : 0.5667
## Neg Pred Value : 0.7857
## Prevalence : 0.3966
## Detection Rate : 0.2931
## Detection Prevalence : 0.5172
## Balanced Accuracy : 0.6839
##
## 'Positive' Class : No
##
#SVM (radial) Cross Validation ROC
svmr_cv_roc <- roc(test_set$readmitted, svmr_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#SVM (radial) Cross Validation AUC
svmr_cv_auc <- auc(svmr_cv_roc)
svmr_cv_auc
## Area under the curve: 0.6969
#Append comparison lists
Model_Names <- append(Model_Names, "SVM (radial) Cross Validation")
Model_AUC_Values <- append(Model_AUC_Values, svmr_cv_auc)
SVM Cross Validation
svml_cv <- train(readmitted ~., data = train_set,
method = "svmLinear",
metric="ROC",
trControl = train_control,
preProcess = c("center", "scale"))
svml_cv
## Support Vector Machines with Linear Kernel
##
## 231 samples
## 43 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (43), scaled (43)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 208, 209, 208, 207, 208, 207, ...
## Resampling results:
##
## ROC Sens Spec
## 0.6521795 0.1811111 0.9274725
##
## Tuning parameter 'C' was held constant at a value of 1
#Best Tuned parameters
svml_cv$bestTune
## C
## 1 1
svml_cv$results
## C ROC Sens Spec ROCSD SensSD SpecSD
## 1 1 0.6521795 0.1811111 0.9274725 0.1350173 0.136992 0.06738334
#Prediction on test set
svml_cv_pred <- predict(svml_cv, test_set)
svml_cv_pred
## [1] Yes Yes Yes Yes Yes No Yes Yes Yes No Yes Yes Yes Yes Yes Yes Yes Yes Yes
## [20] Yes Yes Yes Yes Yes Yes Yes Yes No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes
## [39] Yes Yes Yes Yes Yes Yes Yes No Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes
## [58] Yes
## Levels: No Yes
#Obtain probabilities
svml_cv_prob <- predict(svml_cv, test_set, type = "prob")
svml_cv_prob
## No Yes
## 1 0.4431314 0.5568686
## 2 0.4982408 0.5017592
## 3 0.4658163 0.5341837
## 4 0.3745219 0.6254781
## 5 0.3916940 0.6083060
## 6 0.5326142 0.4673858
## 7 0.3481217 0.6518783
## 8 0.3292040 0.6707960
## 9 0.3273604 0.6726396
## 10 0.5068224 0.4931776
## 11 0.3854271 0.6145729
## 12 0.4896421 0.5103579
## 13 0.3754099 0.6245901
## 14 0.4779784 0.5220216
## 15 0.4172089 0.5827911
## 16 0.4946418 0.5053582
## 17 0.4586103 0.5413897
## 18 0.4617907 0.5382093
## 19 0.4237267 0.5762733
## 20 0.3822677 0.6177323
## 21 0.3485692 0.6514308
## 22 0.4436891 0.5563109
## 23 0.3484069 0.6515931
## 24 0.4349139 0.5650861
## 25 0.4190052 0.5809948
## 26 0.3149672 0.6850328
## 27 0.3710468 0.6289532
## 28 0.5028279 0.4971721
## 29 0.3956450 0.6043550
## 30 0.4497018 0.5502982
## 31 0.4340911 0.5659089
## 32 0.3384045 0.6615955
## 33 0.3827347 0.6172653
## 34 0.4596682 0.5403318
## 35 0.4408209 0.5591791
## 36 0.4302368 0.5697632
## 37 0.3019393 0.6980607
## 38 0.4608635 0.5391365
## 39 0.3269750 0.6730250
## 40 0.4914957 0.5085043
## 41 0.4084308 0.5915692
## 42 0.3931231 0.6068769
## 43 0.4812504 0.5187496
## 44 0.4959837 0.5040163
## 45 0.3294059 0.6705941
## 46 0.5113351 0.4886649
## 47 0.3086960 0.6913040
## 48 0.4904065 0.5095935
## 49 0.4636852 0.5363148
## 50 0.3507997 0.6492003
## 51 0.3872374 0.6127626
## 52 0.4344101 0.5655899
## 53 0.3982462 0.6017538
## 54 0.4672895 0.5327105
## 55 0.3509703 0.6490297
## 56 0.4172474 0.5827526
## 57 0.4279689 0.5720311
## 58 0.4244512 0.5755488
#Confusion matrix
svml_cv_cm <- confusionMatrix(svml_cv_pred, test_set$readmitted)
svml_cv_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3 1
## Yes 20 34
##
## Accuracy : 0.6379
## 95% CI : (0.5012, 0.7601)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.3467
##
## Kappa : 0.1187
##
## Mcnemar's Test P-Value : 8.568e-05
##
## Sensitivity : 0.13043
## Specificity : 0.97143
## Pos Pred Value : 0.75000
## Neg Pred Value : 0.62963
## Prevalence : 0.39655
## Detection Rate : 0.05172
## Detection Prevalence : 0.06897
## Balanced Accuracy : 0.55093
##
## 'Positive' Class : No
##
#SVM (linear) Cross Validation ROC
svml_cv_roc <- roc(test_set$readmitted, svml_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#SVM (linear) Cross Validation AUC
svml_cv_auc <- auc(svml_cv_roc)
svml_cv_auc
## Area under the curve: 0.7155
#Append comparison lists
Model_Names <- append(Model_Names, "SVM (linear) Cross Validation ")
Model_AUC_Values <- append(Model_AUC_Values, svml_cv_auc)
K-nn Cross Validation.
#Model
knn_cv = train(readmitted ~., data = train_set,
method = "knn",
metric="ROC",
trControl = train_control)
knn_cv
## k-Nearest Neighbors
##
## 231 samples
## 43 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 208, 207, 209, 208, 207, 207, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 5 0.5697833 0.4288889 0.6164835
## 7 0.5864866 0.4622222 0.6741758
## 9 0.6021215 0.4400000 0.6450549
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
knn_cv$bestTune
## k
## 3 9
knn_cv$results
## k ROC Sens Spec ROCSD SensSD SpecSD
## 1 5 0.5697833 0.4288889 0.6164835 0.1133132 0.2236497 0.11640631
## 2 7 0.5864866 0.4622222 0.6741758 0.1210569 0.2062933 0.08920185
## 3 9 0.6021215 0.4400000 0.6450549 0.1208528 0.2272397 0.09103135
#K-nn cv prediction on test set
knn_cv_pred <- predict(knn_cv, test_set)
knn_cv_pred
## [1] No Yes No Yes Yes No Yes Yes Yes No No No Yes No No No No No No
## [20] No Yes No No No Yes Yes Yes No Yes No No Yes Yes No No Yes Yes No
## [39] Yes No Yes Yes No No Yes Yes Yes No No Yes Yes No Yes Yes Yes Yes No
## [58] Yes
## Levels: No Yes
#Obtain probabilities
knn_cv_prob <- predict(knn_cv, test_set, type = "prob")
knn_cv_prob
## No Yes
## 1 0.5555556 0.4444444
## 2 0.4444444 0.5555556
## 3 0.6666667 0.3333333
## 4 0.3333333 0.6666667
## 5 0.4444444 0.5555556
## 6 0.7777778 0.2222222
## 7 0.3333333 0.6666667
## 8 0.1111111 0.8888889
## 9 0.3333333 0.6666667
## 10 0.6666667 0.3333333
## 11 0.7777778 0.2222222
## 12 0.5555556 0.4444444
## 13 0.3333333 0.6666667
## 14 0.6666667 0.3333333
## 15 0.5555556 0.4444444
## 16 0.5555556 0.4444444
## 17 0.7777778 0.2222222
## 18 0.6666667 0.3333333
## 19 0.7777778 0.2222222
## 20 0.6666667 0.3333333
## 21 0.4444444 0.5555556
## 22 0.5555556 0.4444444
## 23 0.5555556 0.4444444
## 24 0.5555556 0.4444444
## 25 0.4444444 0.5555556
## 26 0.2222222 0.7777778
## 27 0.2222222 0.7777778
## 28 0.5555556 0.4444444
## 29 0.4444444 0.5555556
## 30 1.0000000 0.0000000
## 31 0.6666667 0.3333333
## 32 0.1111111 0.8888889
## 33 0.1111111 0.8888889
## 34 0.6666667 0.3333333
## 35 0.6666667 0.3333333
## 36 0.4444444 0.5555556
## 37 0.3333333 0.6666667
## 38 0.6666667 0.3333333
## 39 0.4444444 0.5555556
## 40 0.6666667 0.3333333
## 41 0.3333333 0.6666667
## 42 0.3333333 0.6666667
## 43 0.6666667 0.3333333
## 44 0.5555556 0.4444444
## 45 0.1111111 0.8888889
## 46 0.4444444 0.5555556
## 47 0.3333333 0.6666667
## 48 0.8888889 0.1111111
## 49 0.8888889 0.1111111
## 50 0.4444444 0.5555556
## 51 0.0000000 1.0000000
## 52 0.7777778 0.2222222
## 53 0.1111111 0.8888889
## 54 0.4444444 0.5555556
## 55 0.4444444 0.5555556
## 56 0.2222222 0.7777778
## 57 0.5555556 0.4444444
## 58 0.2222222 0.7777778
#Confusion matrix
knn_cv_cm <- confusionMatrix(knn_cv_pred, test_set$readmitted)
knn_cv_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 16 13
## Yes 7 22
##
## Accuracy : 0.6552
## 95% CI : (0.5188, 0.7751)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.2529
##
## Kappa : 0.3103
##
## Mcnemar's Test P-Value : 0.2636
##
## Sensitivity : 0.6957
## Specificity : 0.6286
## Pos Pred Value : 0.5517
## Neg Pred Value : 0.7586
## Prevalence : 0.3966
## Detection Rate : 0.2759
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.6621
##
## 'Positive' Class : No
##
#K-nn Cross Validation ROC
knn_cv_roc <- roc(test_set$readmitted, knn_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#K-nn Cross Validation AUC
knn_cv_auc <- auc(knn_cv_roc)
knn_cv_auc
## Area under the curve: 0.646
#Append comparison lists
Model_Names <- append(Model_Names, "K-nn Cross Validation")
Model_AUC_Values <- append(Model_AUC_Values, knn_cv_auc)
Decision Tree Cross Validation.
#Model
dt_cv <- train(readmitted ~., data = train_set,
method = "rpart",
metric = "ROC",
trControl = train_control)
dt_cv
## CART
##
## 231 samples
## 43 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 208, 207, 208, 208, 208, 208, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.02867384 0.5675061 0.3544444 0.7247253
## 0.03225806 0.5661935 0.2900000 0.7620879
## 0.17204301 0.5346825 0.1622222 0.9071429
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02867384.
dt_cv$results
## cp ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.02867384 0.5675061 0.3544444 0.7247253 0.08394223 0.1232226 0.1262395
## 2 0.03225806 0.5661935 0.2900000 0.7620879 0.07959142 0.1492046 0.1289630
## 3 0.17204301 0.5346825 0.1622222 0.9071429 0.05329365 0.2242011 0.1390362
#Best tune parameters
dt_cv$bestTune
## cp
## 1 0.02867384
#Prediction on test
dt_cv_pred <- predict(dt_cv, test_set)
dt_cv_pred
## [1] No No No No Yes No Yes Yes Yes No No No Yes Yes No No No No No
## [20] Yes Yes Yes Yes Yes Yes Yes Yes No Yes Yes Yes Yes Yes Yes Yes Yes Yes No
## [39] Yes No Yes Yes No No Yes No Yes No No Yes Yes No Yes No Yes Yes Yes
## [58] Yes
## Levels: No Yes
#Obtain probabilities
dt_cv_prob <- predict(dt_cv, test_set, type = "prob")
dt_cv_prob
## No Yes
## 461 0.8571429 0.1428571
## 824 0.6363636 0.3636364
## 962 0.6176471 0.3823529
## 1984 0.6176471 0.3823529
## 2309 0.1612903 0.8387097
## 5016 0.8571429 0.1428571
## 6129 0.1612903 0.8387097
## 6452 0.2745098 0.7254902
## 7698 0.2800000 0.7200000
## 9471 0.8571429 0.1428571
## 11728 0.6176471 0.3823529
## 15359 0.6176471 0.3823529
## 15538 0.2800000 0.7200000
## 17795 0.1612903 0.8387097
## 18511 0.6176471 0.3823529
## 20963 0.6176471 0.3823529
## 23943 0.6176471 0.3823529
## 25851 0.6176471 0.3823529
## 29060 1.0000000 0.0000000
## 29366 0.1612903 0.8387097
## 30649 0.2745098 0.7254902
## 31935 0.2745098 0.7254902
## 32130 0.1612903 0.8387097
## 34606 0.2745098 0.7254902
## 35975 0.2800000 0.7200000
## 36349 0.1612903 0.8387097
## 36685 0.2745098 0.7254902
## 39784 0.6176471 0.3823529
## 39833 0.1612903 0.8387097
## 40684 0.2745098 0.7254902
## 44259 0.2745098 0.7254902
## 44286 0.1612903 0.8387097
## 46721 0.2800000 0.7200000
## 51517 0.2745098 0.7254902
## 57291 0.1612903 0.8387097
## 57846 0.1612903 0.8387097
## 58435 0.1612903 0.8387097
## 60051 0.6176471 0.3823529
## 65125 0.1612903 0.8387097
## 66118 0.8571429 0.1428571
## 66395 0.2800000 0.7200000
## 68026 0.2745098 0.7254902
## 68269 0.6176471 0.3823529
## 68348 0.6176471 0.3823529
## 70681 0.1612903 0.8387097
## 71862 0.8571429 0.1428571
## 72980 0.1612903 0.8387097
## 74933 0.6176471 0.3823529
## 76453 0.6176471 0.3823529
## 76775 0.2745098 0.7254902
## 76814 0.2745098 0.7254902
## 77043 0.6176471 0.3823529
## 78993 0.2800000 0.7200000
## 79192 0.6176471 0.3823529
## 82660 0.1612903 0.8387097
## 84909 0.2800000 0.7200000
## 88479 0.2745098 0.7254902
## 101030 0.2800000 0.7200000
#DT CV confusion matrix
dt_cv_cm <- confusionMatrix(dt_cv_pred, test_set$readmitted)
dt_cv_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 13 10
## Yes 10 25
##
## Accuracy : 0.6552
## 95% CI : (0.5188, 0.7751)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.2529
##
## Kappa : 0.2795
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5652
## Specificity : 0.7143
## Pos Pred Value : 0.5652
## Neg Pred Value : 0.7143
## Prevalence : 0.3966
## Detection Rate : 0.2241
## Detection Prevalence : 0.3966
## Balanced Accuracy : 0.6398
##
## 'Positive' Class : No
##
#DT CV ROC
dt_cv_roc <- roc(test_set$readmitted, dt_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#DT CV AUC
dt_cv_auc <- auc(dt_cv_roc)
dt_cv_auc
## Area under the curve: 0.7081
#Append comparison lists
Model_Names <- append(Model_Names, "Decision Tree Cross Validation")
Model_AUC_Values <- append(Model_AUC_Values, dt_cv_auc)
Random Forest Cross Validation.
#Model
rf_cv <- train(readmitted~., data = train_set,
method = "rf",
metric = "ROC",
trControl = train_control)
rf_cv
## Random Forest
##
## 231 samples
## 43 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 207, 208, 208, 209, 208, 207, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.6322436 0.2477778 0.8543956
## 22 0.6576374 0.3955556 0.7609890
## 43 0.6564713 0.3966667 0.7313187
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 22.
rf_cv$results
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 1 2 0.6322436 0.2477778 0.8543956 0.03614236 0.05291243 0.07723858
## 2 22 0.6576374 0.3955556 0.7609890 0.06861013 0.14327159 0.07614512
## 3 43 0.6564713 0.3966667 0.7313187 0.08186449 0.13011022 0.06126367
rf_cv$bestTune
## mtry
## 2 22
#RF CV prediction on test data
rf_cv_pred <- predict(rf_cv, test_set)
rf_cv_pred
## [1] Yes No No Yes Yes No Yes Yes Yes No No No Yes Yes Yes No No No Yes
## [20] Yes Yes Yes Yes Yes Yes Yes Yes No Yes Yes Yes Yes Yes No Yes Yes Yes No
## [39] Yes Yes Yes Yes No No Yes No Yes No No Yes Yes No Yes No Yes Yes Yes
## [58] Yes
## Levels: No Yes
#Obtain probabilities
rf_cv_prob <- predict(rf_cv, test_set, type = "prob")
rf_cv_prob
## No Yes
## 461 0.312 0.688
## 824 0.684 0.316
## 962 0.780 0.220
## 1984 0.398 0.602
## 2309 0.360 0.640
## 5016 0.734 0.266
## 6129 0.250 0.750
## 6452 0.374 0.626
## 7698 0.218 0.782
## 9471 0.784 0.216
## 11728 0.504 0.496
## 15359 0.556 0.444
## 15538 0.204 0.796
## 17795 0.278 0.722
## 18511 0.498 0.502
## 20963 0.736 0.264
## 23943 0.764 0.236
## 25851 0.618 0.382
## 29060 0.460 0.540
## 29366 0.372 0.628
## 30649 0.424 0.576
## 31935 0.462 0.538
## 32130 0.118 0.882
## 34606 0.462 0.538
## 35975 0.258 0.742
## 36349 0.074 0.926
## 36685 0.208 0.792
## 39784 0.708 0.292
## 39833 0.118 0.882
## 40684 0.438 0.562
## 44259 0.384 0.616
## 44286 0.204 0.796
## 46721 0.338 0.662
## 51517 0.576 0.424
## 57291 0.464 0.536
## 57846 0.376 0.624
## 58435 0.300 0.700
## 60051 0.640 0.360
## 65125 0.168 0.832
## 66118 0.494 0.506
## 66395 0.332 0.668
## 68026 0.272 0.728
## 68269 0.830 0.170
## 68348 0.666 0.334
## 70681 0.170 0.830
## 71862 0.644 0.356
## 72980 0.096 0.904
## 74933 0.838 0.162
## 76453 0.620 0.380
## 76775 0.306 0.694
## 76814 0.350 0.650
## 77043 0.546 0.454
## 78993 0.142 0.858
## 79192 0.512 0.488
## 82660 0.358 0.642
## 84909 0.388 0.612
## 88479 0.448 0.552
## 101030 0.314 0.686
#RF CV confusion matrix
rf_cv_cm <- confusionMatrix(rf_cv_pred, test_set$readmitted)
rf_cv_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 11 8
## Yes 12 27
##
## Accuracy : 0.6552
## 95% CI : (0.5188, 0.7751)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.2529
##
## Kappa : 0.2574
##
## Mcnemar's Test P-Value : 0.5023
##
## Sensitivity : 0.4783
## Specificity : 0.7714
## Pos Pred Value : 0.5789
## Neg Pred Value : 0.6923
## Prevalence : 0.3966
## Detection Rate : 0.1897
## Detection Prevalence : 0.3276
## Balanced Accuracy : 0.6248
##
## 'Positive' Class : No
##
#RF CV ROC
rf_cv_roc <- roc(test_set$readmitted, rf_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#RF CV AUC
rf_cv_auc <- auc(dt_cv_roc)
rf_cv_auc
## Area under the curve: 0.7081
#Append comparison lists
Model_Names <- append(Model_Names, "Random Forest Cross Validation")
Model_AUC_Values <- append(Model_AUC_Values, rf_cv_auc)
XGBoost Cross Validation
#Setting hyperparameters
xgb_grid = expand.grid(nrounds = 100,
eta = 0.3,
max_depth = 1,
gamma = 0,
colsample_bytree = 0.6,
min_child_weight = 1,
subsample = 0.5)
#Model
xgb_cv <- train(readmitted~., data = train_variance,
method = "xgbTree",
metric = "ROC",
tuneGrid = xgb_grid,
trControl = train_control)
xgb_cv
## eXtreme Gradient Boosting
##
## 231 samples
## 43 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 208, 207, 208, 209, 208, 208, ...
## Resampling results:
##
## ROC Sens Spec
## 0.6592857 0.4966667 0.7032967
##
## Tuning parameter 'nrounds' was held constant at a value of 100
## Tuning
## held constant at a value of 1
## Tuning parameter 'subsample' was held
## constant at a value of 0.5
xgb_cv$results
## nrounds eta max_depth gamma colsample_bytree min_child_weight subsample
## 1 100 0.3 1 0 0.6 1 0.5
## ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.6592857 0.4966667 0.7032967 0.1259415 0.2499959 0.1437466
xgb_cv$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1 100 1 0.3 0 0.6 1 0.5
#Prediction of test set
xgb_cv_pred <- predict(xgb_cv, test_variance)
xgb_cv_pred
## [1] Yes No No Yes Yes No Yes Yes Yes No Yes No Yes Yes No No No No No
## [20] Yes Yes Yes Yes Yes Yes Yes Yes No Yes No No Yes Yes Yes Yes Yes Yes No
## [39] Yes No Yes Yes No No Yes No Yes No No Yes Yes No Yes No Yes Yes Yes
## [58] Yes
## Levels: No Yes
#Obtain probabilities
xgb_cv_prob <- predict(xgb_cv, test_variance, type = "prob")
xgb_cv_prob
## No Yes
## 1 0.32917005 0.67082995
## 2 0.86825436 0.13174564
## 3 0.82599509 0.17400491
## 4 0.24610515 0.75389485
## 5 0.21723588 0.78276412
## 6 0.92182052 0.07817948
## 7 0.04456132 0.95543868
## 8 0.22743192 0.77256808
## 9 0.09605937 0.90394063
## 10 0.90615952 0.09384048
## 11 0.11220569 0.88779431
## 12 0.92583340 0.07416660
## 13 0.37088314 0.62911686
## 14 0.19849150 0.80150850
## 15 0.58358234 0.41641766
## 16 0.90069807 0.09930193
## 17 0.85694236 0.14305764
## 18 0.80964696 0.19035304
## 19 0.52580488 0.47419512
## 20 0.30859825 0.69140175
## 21 0.17238551 0.82761449
## 22 0.31953821 0.68046179
## 23 0.12107700 0.87892300
## 24 0.30053511 0.69946489
## 25 0.25890693 0.74109307
## 26 0.06948786 0.93051214
## 27 0.31062853 0.68937147
## 28 0.86036950 0.13963050
## 29 0.31823820 0.68176180
## 30 0.51464057 0.48535943
## 31 0.68005693 0.31994307
## 32 0.30210975 0.69789025
## 33 0.30628887 0.69371113
## 34 0.42075387 0.57924613
## 35 0.28659305 0.71340695
## 36 0.49482372 0.50517628
## 37 0.08175056 0.91824944
## 38 0.66953462 0.33046538
## 39 0.03546740 0.96453260
## 40 0.71574247 0.28425753
## 41 0.41602308 0.58397692
## 42 0.32529932 0.67470068
## 43 0.71841925 0.28158075
## 44 0.72792310 0.27207690
## 45 0.05762353 0.94237647
## 46 0.74586344 0.25413656
## 47 0.18621323 0.81378677
## 48 0.91335452 0.08664548
## 49 0.68886632 0.31113368
## 50 0.09414817 0.90585183
## 51 0.14021052 0.85978948
## 52 0.51479864 0.48520136
## 53 0.21080305 0.78919695
## 54 0.74617946 0.25382054
## 55 0.14976101 0.85023899
## 56 0.34436220 0.65563780
## 57 0.45173427 0.54826573
## 58 0.28236154 0.71763846
#XGB CV confusion matrix
xgb_cv_cm <- confusionMatrix(xgb_cv_pred, test_variance$readmitted, positive = "Yes")
xgb_cv_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 14 8
## Yes 9 27
##
## Accuracy : 0.7069
## 95% CI : (0.5727, 0.8191)
## No Information Rate : 0.6034
## P-Value [Acc > NIR] : 0.06804
##
## Kappa : 0.383
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.7714
## Specificity : 0.6087
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.6364
## Prevalence : 0.6034
## Detection Rate : 0.4655
## Detection Prevalence : 0.6207
## Balanced Accuracy : 0.6901
##
## 'Positive' Class : Yes
##
xgb_cv_roc <- roc(test_variance$readmitted, xgb_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
#XGB CV AUC
xgb_cv_auc <- auc(xgb_cv_roc)
xgb_cv_auc
## Area under the curve: 0.7155
#Append comparison lists
Model_Names <- append(Model_Names, "XGBoost Cross Validation")
Model_AUC_Values <- append(Model_AUC_Values, xgb_cv_auc)
Model Comparison
Model AUC table
model_compare <- data.frame(Model_Names, Model_AUC_Values)
model_compare
## Model_Names Model_AUC_Values
## 1 Logistic Regression 0.7378882
## 2 GLMNet 0.7093168
## 3 Decision Tree 0.6260870
## 4 SVM 0.4857143
## 5 K-nn 0.5329193
## 6 XGBoost 0.6496894
## 7 Logistic Regression PCA 0.6447205
## 8 SVM PCA 0.4857143
## 9 K-nn PCA 0.5459627
## 10 Logistic Regression Cross Validation 0.7267081
## 11 GLMNet Cross Validation 0.6993789
## 12 SVM (radial) Cross Validation 0.6968944
## 13 SVM (linear) Cross Validation 0.7155280
## 14 K-nn Cross Validation 0.6459627
## 15 Decision Tree Cross Validation 0.7080745
## 16 Random Forest Cross Validation 0.7080745
## 17 XGBoost Cross Validation 0.7155280
Models ROC plots
log_reg_plot = plot(lr_roc, col = "#ce7e00")
text(x = 1.2, y = 0.2, labels = paste("AUC =", round(auc(lr_roc), 2)), col = "#ce7e00")
dt_plot = plot(dt_roc, add = TRUE, col = "#ffbf00")
text(x = 1.2, y = 0.3, labels = paste("AUC =", round(auc(dt_roc), 2)), col = "#ffbf00")
rf_plot = plot(rf_roc, add = TRUE, col = "#0b5394")
text(x = 1.2, y = 0.4, labels = paste("AUC =", round(auc(rf_roc), 2)), col = "#0b5394")
svm_plot = plot(svm_roc, add = TRUE, col = "#741b47")
text(x = 1.2, y = 0.5, labels = paste("AUC =", round(auc(svm_roc), 2)), col = "#741b47")
knn_plot = plot(knn_roc, add = TRUE, col = "#38761d")
text(x = 1.2, y = 0.6, labels = paste("AUC =", round(auc(knn_roc), 2)), col = "#38761d")
xgb_plot = plot(xgb_roc, add= TRUE, col = "#1ecdc9")
text(x = 1.2, y = 0.7, labels = paste("AUC =", round(auc(xgb_roc), 2)), col = "#1ecdc9")
glmnet_plot = plot(glmnet_roc, add = TRUE, col = "#ff95be")
text(x = 1.2, y = 0.8, labels = paste("AUC =", round(auc(glmnet_roc), 2)), col = "#ff95be")
PCA Model ROC Plots
log_reg_pca_plot = plot(lr_pca_roc, col = "#ce7e00")
text(x = 1.2, y = 0.2, labels = paste("AUC =", round(auc(lr_pca_roc), 2)), col = "#ce7e00")
svm_pca_plot = plot(svm_pca_roc, add = TRUE, col = "#741b47")
text(x = 1.2, y = 0.4, labels = paste("AUC =", round(auc(svm_pca_roc), 2)), col = "#741b47")
knn_pca_plot = plot(knn_pca_roc, add = TRUE, col = "#38761d")
text(x = 1.2, y = 0.6, labels = paste("AUC =", round(auc(knn_pca_roc), 2)), col = "#38761d")
CV Model ROC Plots
lr_cv_plot = plot(lr_cv_roc, col = "#4d982c")
text(x = 1.2, y = 0.2, labels = paste("AUC =", round(auc(lr_cv_roc), 2)), col = "#4d982c")
dt_cv_plot = plot(dt_cv_roc, add = TRUE, col = "#ffbf00")
text(x = 1.2, y = 0.3, labels = paste("AUC =", round(auc(dt_cv_roc), 2)), col = "#ffbf00")
rf_cv_plot = plot(rf_cv_roc, add = TRUE, col = "#0b5394")
text(x = 1.2, y = 0.4, labels = paste("AUC =", round(auc(rf_cv_roc), 2)), col = "#0b5394")
svmr_cv_plot = plot(svmr_cv_roc, add = TRUE, col = "#741b47")
text(x = 1.2, y = 0.5, labels = paste("AUC =", round(auc(svmr_cv_roc), 2)), col = "#741b47")
svml_cv_plot = plot(svml_cv_roc, add = TRUE, col = "#a23ef3")
text(x = 1.2, y = 0.6, labels = paste("AUC =", round(auc(svml_cv_roc), 2)), col = "#a23ef3")
knn_cv_plot = plot(knn_cv_roc, add = TRUE, col = "#38761d")
text(x = 1.2, y = 0.7, labels = paste("AUC =", round(auc(knn_cv_roc), 2)), col = "#38761d")
xgb_cv_plot = plot(xgb_cv_roc, add= TRUE, col = "#1ecdc9")
text(x = 1.2, y = 0.8, labels = paste("AUC =", round(auc(xgb_cv_roc), 2)), col = "#1ecdc9")
glmnet_cv_plot = plot(glmnet_cv_roc, add = TRUE, col = "#ce7e00")
text(x = 1.2, y = 0.9, labels = paste("AUC =", round(auc(glmnet_cv_roc), 2)), col = "#ce7e00")
Performance Summary The aim of project was to predict hospital readmission based on several patient related predictors. The data-set contained missing/invalid observations which were removed as the data-set contained relatively large number of observations. The information from all factor predictor variables was incorporated in the data-set by creating their dummy variables. Thus final data-set comprised of all originally numeric + dummy variables and contained 44 predictors and 289 observations . Unsupervised techniques; Hierarchical and K-means Clustering were used to visualize grouping of similar data. Dimensionality Reduction was performed using PCA feature extraction technique. The PCA data and expanded data sets were then used for performance evaluation using several Supervised Learning Algorithms and Cross Validation techniques. All model performances were compared using the AUC metric. Based on outcomes of all models above, for prediction of readmission, Logistic Regression Model with time_in_hospital,num_lab_procedures,number_inpatient and Other_diag as significant predictors was deemed to be the best performing model at AUC of 74%.